def msOutputProducer(self, requestRecords): """ A top level function to fetch requests from ReqMgr2, and produce the corresponding records for MSOutput in MongoDB. :param requestRecords: a dictionary of request dictionaries retrieved from ReqMgr2 It's implemented as a pipeline, performing the following sequential actions: 1) document transformer - creates a MSOutputTemplate object from the request dict 2) document info updater - parses the MSOutputTemplate object and updates the necessary data structure mapping output/locations/campaign/etc 3) document uploader - inserts the MSOutputTemplate object into the correct MongoDB collection (RelVal is separated from standard workflows) 4) document cleaner - releases memory reference to the MSOutputTemplate object """ # DONE: # to set a destructive function at the end of the pipeline # NOTE: # To discuss the collection names # NOTE: # Here we should never use docUploader with `update=True`, because # this will erase the latest state of already existing and fully or # partially processed documents by the Consumer pipeline self.logger.info("Running the msOutputProducer ...") msPipeline = Pipeline(name="MSOutputProducer Pipeline", funcLine=[ Functor(self.docTransformer), Functor(self.docInfoUpdate), Functor(self.docUploader), Functor(self.docCleaner) ]) # TODO: # To generate the object from within the Function scope see above. counter = 0 for request in viewvalues(requestRecords): if request['RequestName'] in self.requestNamesCached: # if it's cached, then it's already in MongoDB, no need to process it again continue counter += 1 try: pipeLineName = msPipeline.getPipelineName() msPipeline.run(request) except (KeyError, TypeError) as ex: msg = "%s Possibly broken read from ReqMgr2 API or other. Err: %s." % ( pipeLineName, str(ex)) msg += " Continue to the next document." self.logger.exception(msg) continue except Exception as ex: msg = "%s General Error from pipeline. Err: %s. " % ( pipeLineName, str(ex)) msg += "Giving up Now." self.logger.exception(msg) break return counter
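# NOTE (illustrative): the Pipeline and Functor helpers used throughout this module are
# not shown in this section. The sketch below is only a minimal reading of the contract
# their usage implies (a named chain of callables, each step receiving the previous
# step's return value, with extra arguments bound at construction time); the real
# WMCore implementation may differ.
class Functor(object):
    """Bind a callable together with additional positional/keyword arguments."""
    def __init__(self, func, *args, **kwargs):
        self.func = func
        self.args = args
        self.kwargs = kwargs

    def __call__(self, obj):
        # The piped object always goes first, followed by the bound arguments
        return self.func(obj, *self.args, **self.kwargs)


class Pipeline(object):
    """Run a list of Functors sequentially, feeding each one the previous result."""
    def __init__(self, name="Pipeline", funcLine=None):
        self.name = name
        self.funcLine = funcLine or []

    def getPipelineName(self):
        return self.name

    def run(self, obj):
        for functor in self.funcLine:
            obj = functor(obj)
        return obj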
def __init__(self, msConfig, logger=None): """ Runs the basic setup and initialization for the MSUnmerged module :param msConfig: micro service configuration """ super(MSUnmerged, self).__init__(msConfig, logger=logger) self.msConfig.setdefault("verbose", True) self.msConfig.setdefault("interval", 60) self.msConfig.setdefault("limitFilesPerRSE", 200) self.msConfig.setdefault("skipRSEs", []) self.msConfig.setdefault("rseExpr", "*") self.msConfig.setdefault("enableRealMode", False) self.msConfig.setdefault("dumpRSE", False) self.msConfig.setdefault("gfalLogLevel", 'normal') self.msConfig.setdefault("dirFilterIncl", []) self.msConfig.setdefault("dirFilterExcl", []) self.msConfig.setdefault("emulateGfal2", False) self.msConfig.setdefault("filesToDeleteSliceSize", 100) if self.msConfig['emulateGfal2'] is False and gfal2 is None: msg = "Failed to import gfal2 library while it's not " msg += "set to emulate it. Crashing the service!" raise ImportError(msg) # TODO: Add 'alertManagerUrl' to msConfig' # self.alertServiceName = "ms-unmerged" # self.alertManagerAPI = AlertManagerAPI(self.msConfig.get("alertManagerUrl", None), logger=logger) # Instantiating the Rucio Consistency Monitor Client self.rucioConMon = RucioConMon(self.msConfig['rucioConMon'], logger=self.logger) self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger) # Building all the Pipelines: pName = 'plineUnmerged' self.plineUnmerged = Pipeline(name=pName, funcLine=[Functor(self.updateRSETimestamps, start=True, end=False), Functor(self.consRecordAge), Functor(self.getUnmergedFiles), Functor(self.filterUnmergedFiles), Functor(self.getPfn), Functor(self.cleanRSE), Functor(self.updateRSECounters, pName), Functor(self.updateRSETimestamps, start=False, end=True), Functor(self.purgeRseObj, dumpRSE=self.msConfig['dumpRSE'])]) # Initialization of the deleted files counters: self.rseCounters = {} self.plineCounters = {} self.rseTimestamps = {} self.rseConsStats = {} self.protectedLFNs = [] # The basic /store/unmerged regular expression: self.regStoreUnmergedLfn = re.compile("^/store/unmerged/.*$") self.regStoreUnmergedPfn = re.compile("^.+/store/unmerged/.*$")
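# NOTE (illustrative): the two patterns compiled above separate plain /store/unmerged
# LFNs from full PFNs that carry a protocol/host prefix. A small usage sketch with
# made-up example paths:
import re

regStoreUnmergedLfn = re.compile("^/store/unmerged/.*$")
regStoreUnmergedPfn = re.compile("^.+/store/unmerged/.*$")

lfn = "/store/unmerged/Era/PrimaryDS/AODSIM/file.root"  # hypothetical LFN
pfn = "davs://se.example.org:443/cms/store/unmerged/Era/PrimaryDS/AODSIM/file.root"  # hypothetical PFN

assert regStoreUnmergedLfn.match(lfn) and not regStoreUnmergedLfn.match(pfn)
assert regStoreUnmergedPfn.match(pfn)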
class MSRuleCleaner(MSCore): """ MSRuleCleaner.py class provides the logic used to clean the Rucio block level data placement rules created by WMAgent. """ def __init__(self, msConfig, logger=None): """ Runs the basic setup and initialization for the MSRuleCleaner module :param msConfig: micro service configuration """ super(MSRuleCleaner, self).__init__(msConfig, logger=logger) self.msConfig.setdefault("verbose", True) self.msConfig.setdefault("interval", 60) self.msConfig.setdefault("services", ['ruleCleaner']) self.msConfig.setdefault("rucioWmaAccount", "wma_test") self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor") self.msConfig.setdefault('enableRealMode', False) self.mode = "RealMode" if self.msConfig[ 'enableRealMode'] else "DryRunMode" self.emailAlert = EmailAlert(self.msConfig) self.curlMgr = RequestHandler() # Building all the Pipelines: pName = 'plineMSTrCont' self.plineMSTrCont = Pipeline(name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.cleanRucioRules) ]) pName = 'plineMSTrBlock' self.plineMSTrBlock = Pipeline(name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.cleanRucioRules) ]) pName = 'plineAgentCont' self.plineAgentCont = Pipeline( name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'container', self.msConfig['rucioWmaAccount']), Functor(self.cleanRucioRules) ]) pName = 'plineAgentBlock' self.plineAgentBlock = Pipeline( name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'block', self.msConfig['rucioWmaAccount']), Functor(self.cleanRucioRules) ]) pName = 'plineArchive' self.plineArchive = Pipeline(name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.setClean), Functor(self.archive) ]) # Building the different set of plines we will need later: # NOTE: The following are all the functional pipelines which are supposed to include # a cleanup function and report cleanup status in the MSRuleCleanerWflow object self.cleanuplines = [ self.plineMSTrCont, self.plineMSTrBlock, self.plineAgentCont, self.plineAgentBlock ] # Building an auxiliary list of cleanup pipeline names only: self.cleanupPipeNames = [pline.name for pline in self.cleanuplines] # Building lists of pipelines related only to Agents or MStransferror self.agentlines = [self.plineAgentCont, self.plineAgentBlock] self.mstrlines = [self.plineMSTrCont, self.plineMSTrBlock] # Initialization of the 'cleaned' and 'archived' counters: self.wfCounters = {'cleaned': {}, 'archived': 0} def resetCounters(self): """ A simple function for zeroing the cleaned and archived counters. """ for pline in self.cleanuplines: self.wfCounters['cleaned'][pline.name] = 0 self.wfCounters['archived'] = 0 def execute(self, reqStatus): """ Executes the whole ruleCleaner logic :return: summary """ # start threads in MSManager which should call this method summary = dict(RULECLEANER_REPORT) self.currThread = current_thread() self.currThreadIdent = self.currThread.name self.updateReportDict(summary, "thread_id", self.currThreadIdent) self.resetCounters() self.logger.info("MSRuleCleaner is running in mode: %s.", self.mode) # Build the list of workflows to work on: try: requestRecords = {} for status in reqStatus: requestRecords.update(self.getRequestRecords(status)) except Exception as err: # general error msg = "Unknown exception while fetching requests from ReqMgr2. 
Error: %s", str( err) self.logger.exception(msg) self.updateReportDict(summary, "error", msg) # Call _execute() and feed the relevant pipeline with the objects popped from requestRecords try: totalNumRequests, cleanNumRequests, archivedNumRequests = self._execute( requestRecords) msg = "\nNumber of processed workflows: %s." msg += "\nNumber of properly cleaned workflows: %s." msg += "\nNumber of archived workflows: %s." self.logger.info(msg, totalNumRequests, cleanNumRequests, archivedNumRequests) self.updateReportDict(summary, "total_num_requests", totalNumRequests) self.updateReportDict(summary, "clean_num_requests", cleanNumRequests) self.updateReportDict(summary, "archived_num_requests", archivedNumRequests) except Exception as ex: msg = "Unknown exception while running MSRuleCleaner thread Error: %s" self.logger.exception(msg, str(ex)) self.updateReportDict(summary, "error", msg) return summary def _execute(self, reqRecords): """ Executes the MSRuleCleaner pipelines based on the workflow status :param reqList: A list of RequestRecords to work on :return: a tuple with: number of properly cleaned requests number of processed workflows number of archived workflows """ # NOTE: The Input Cleanup, the Block Level Cleanup and the Archival # Pipelines are executed sequentially in the above order. # This way we assure ourselves that we archive only workflows # that have accomplished the needed cleanup cleanNumRequests = 0 totalNumRequests = 0 # Call the workflow dispatcher: for _, req in reqRecords.items(): wflow = MSRuleCleanerWflow(req) self._dispatchWflow(wflow) msg = "\n----------------------------------------------------------" msg += "\nMSRuleCleanerWflow: %s" msg += "\n----------------------------------------------------------" self.logger.debug(msg, pformat(wflow)) totalNumRequests += 1 if self._checkClean(wflow): cleanNumRequests += 1 # Report the counters: for pline in self.cleanuplines: msg = "Workflows cleaned by pipeline: %s: %d" self.logger.info(msg, pline.name, self.wfCounters['cleaned'][pline.name]) archivedNumRequests = self.wfCounters['archived'] self.logger.info("Workflows archived: %d", self.wfCounters['archived']) return totalNumRequests, cleanNumRequests, archivedNumRequests def _dispatchWflow(self, wflow): """ A function intended to dispatch a workflow (e.g based on its status) through one or more functional pipelines in case there is some more complicated logic involved in the order we execute them but not just a sequentially """ self.logger.debug("Dispatching workflow: %s", wflow['RequestName']) # NOTE: The following dispatch logic is a subject to be changed at any time # Resolve: # NOTE: First resolve any preliminary flags that will be needed further # in the logic of the _dispatcher() itself if wflow['RequestStatus'] == 'announced': self.getMSOutputTransferInfo(wflow) # Clean: # Do not clean any Resubmission, but still let them be archived if wflow['RequestType'] == 'Resubmission': wflow['ForceArchive'] = True msg = "Skipping cleanup step for workflow: %s - RequestType is %s." msg += " Will try to archive it directly." self.logger.info(msg, wflow['RequestName'], wflow['RequestType']) elif wflow['RequestStatus'] in ['rejected', 'aborted-completed']: # NOTE: We do not check the ParentageResolved flag for these # workflows, but we do need to clean output data placement # rules from the agents for them for pline in self.agentlines: try: pline.run(wflow) except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. 
" msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 elif wflow['RequestStatus'] == 'announced' and not wflow[ 'ParentageResolved']: # NOTE: We skip workflows which are not having 'ParentageResolved' # flag, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'ParentageResolved' flag set to false." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow[ 'RequestStatus'] == 'announced' and not wflow['TransferDone']: # NOTE: We skip workflows which have not yet finalised their TransferStatus # in MSOutput, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'TransferStatus' is 'pending' or 'TransferInfo' is missing in MSOutput." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced': for pline in self.cleanuplines: try: pline.run(wflow) except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 else: # We shouldn't be here: msg = "Skipping workflow: %s - " msg += "Does not fall under any of the defined categories." self.logger.error(msg, wflow['RequestName']) # Archive: try: self.plineArchive.run(wflow) self.wfCounters['archived'] += 1 except MSRuleCleanerArchival as ex: msg = "%s: Archival Error: %s. " msg += " Will retry again in the next cycle." self.logger.error(msg, wflow['PlineMarkers'][-1], ex.message()) except Exception as ex: msg = "%s General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, wflow['PlineMarkers'][-1], wflow['RequestName'], str(ex)) def setPlineMarker(self, wflow, pName): """ A function intended to mark which is the pipeline currently working on the workflow. It is supposed to be called always as a first function in the pipeline. :param wflow: A MSRuleCleaner workflow representation :param pName: The name of the functional pipeline :return wflow: """ # NOTE: The current functional pipeline MUST always be appended at the # end of the 'PlineMarkers' list # First get rid of the default: if not wflow['PlineMarkers']: wflow['PlineMarkers'] = [] # Then push our current value into the markers list: wflow['PlineMarkers'].append(pName) # Populate the list of flags to be used later: if pName not in wflow['RulesToClean']: if pName in self.cleanupPipeNames: wflow['RulesToClean'][pName] = [] if pName not in wflow['CleanupStatus']: if pName in self.cleanupPipeNames: wflow['CleanupStatus'][pName] = False return wflow def _checkClean(self, wflow): """ An auxiliary function used to only check the temporary cleanup status. It basically takes the pipelines registered in 'PlineMarkers' that have already worked on the workflow as a mask and applies this mask over the set of flags in the 'CleanupStatus' field and then reduces the result to a single bool value """ # NOTE: This is one of the few functions taking a workflow as an argument # but returning a bool, since it is an auxiliary function and is not # supposed to be called as a standalone function in a pipeline. 
# NOTE: `all([]) == True`, ergo all the 'rejected' && 'aborted-completed' workflows # are also counted as properly cleaned and can trigger archival later # Build a list of bool flags based on the mask of PlineMarkers cleanFlagsList = [ wflow['CleanupStatus'][key] for key in wflow['PlineMarkers'] if key in wflow['CleanupStatus'].keys() ] # If no one have worked on the workflow set the clean status to false if not wflow['PlineMarkers']: cleanStatus = False # If we have a mask longer than the list of flags avoid false positives # because of the behavior explained above - `all([]) == True` elif not cleanFlagsList: cleanStatus = False # Figure out the final value else: cleanStatus = all(cleanFlagsList) return cleanStatus def setClean(self, wflow): """ A function to set the 'IsClean' flag based on the status from all the pipelines which have worked on the workflow (and have put their markers in the 'PlineMarkers' list) :param wflow: A MSRuleCleaner workflow representation :return wflow: """ wflow['IsClean'] = self._checkClean(wflow) return wflow def archive(self, wflow): """ Move the workflow to the proper archived status after checking the full cleanup status :param wflow: A MSRuleCleaner workflow representation :param archStatus: Target status to transition after archival :return wflow: """ # NOTE: check allowed status transitions with: # https://github.com/dmwm/WMCore/blob/5961d2229b1e548e58259c06af154f33bce36c68/src/python/WMCore/ReqMgr/DataStructs/RequestStatus.py#L171 if not (wflow['IsClean'] or wflow['ForceArchive']): msg = "Not properly cleaned workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchival(msg) # Check the available status transitions before we decide the final status targetStatusList = RequestStatus.REQUEST_STATE_TRANSITION.get( wflow['RequestStatus'], []) self.logger.info("targetStatusList: %s", targetStatusList) return wflow def getMSOutputTransferInfo(self, wflow): """ Fetches the transfer information from the MSOutput REST interface for the given workflow. :param wflow: A MSRuleCleaner workflow representation :return wflow: """ headers = {'Accept': 'application/json'} params = {} url = '%s/data/info?request=%s' % (self.msConfig['msOutputUrl'], wflow['RequestName']) try: res = self.curlMgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert()) data = json.loads(res)['result'][0] transferInfo = data['transferDoc'] except Exception as ex: msg = "General exception while fetching TransferInfo from MSOutput for %s. " msg += "Error: %s" self.logger.exception(msg, wflow['RequestName'], str(ex)) if transferInfo is not None and transferInfo[ 'TransferStatus'] == 'done': wflow['TransferDone'] = True return wflow def getRucioRules(self, wflow, gran, rucioAcct): """ Queries Rucio and builds the relevant list of blocklevel rules for the given workflow :param wflow: A MSRuleCleaner workflow representation :param gran: Data granularity to search for Rucio rules. 
Possible values: 'block' || 'container' :return: wflow """ currPline = wflow['PlineMarkers'][-1] # Find all the output placement rules created by the agents for dataCont in wflow['OutputDatasets']: if gran == 'container': for rule in self.rucio.listDataRules(dataCont, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) elif gran == 'block': try: blocks = self.rucio.getBlocksInContainer(dataCont) for block in blocks: for rule in self.rucio.listDataRules( block, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) except WMRucioDIDNotFoundException: msg = "Container: %s not found in Rucio for workflow: %s." self.logger.info(msg, dataCont, wflow['RequestName']) return wflow def cleanRucioRules(self, wflow): """ Cleans all the Rules present in the field 'RulesToClean' in the MSRuleCleaner workflow representation. And fills the relevant Cleanup Status. :param wflow: A MSRuleCleaner workflow representation :return: wflow """ # NOTE: The function should be called independently and sequentially from # The Input and the respective BlockLevel pipelines. # NOTE: The current functional pipeline is always the last one in the PlineMarkers list currPline = wflow['PlineMarkers'][-1] delResults = [] if self.msConfig['enableRealMode']: for rule in wflow['RulesToClean'][currPline]: self.logger.info("%s: Deleting ruleId: %s ", currPline, rule) delResult = self.rucio.deleteRule(rule) delResults.append(delResult) if not delResult: self.logger.warning("%s: Failed to delete ruleId: %s ", currPline, rule) else: for rule in wflow['RulesToClean'][currPline]: delResults.append(True) self.logger.info("%s: DRY-RUN: Is about to delete ruleId: %s ", currPline, rule) # Set the cleanup flag: wflow['CleanupStatus'][currPline] = all(delResults) # ---------------------------------------------------------------------- # FIXME : To be removed once the plineMSTrBlock && plineMSTrCont are # developed if wflow['CleanupStatus'][currPline] in [ 'plineMSTrBlock', 'plineMSTrCont' ]: wflow['CleanupStatus'][currPline] = True # ---------------------------------------------------------------------- return wflow def getRequestRecords(self, reqStatus): """ Queries ReqMgr2 for requests in a given status. :param reqStatus: The status for the requests to be fetched from ReqMgr2 :return requests: A dictionary with all the workflows in the given status """ self.logger.info("Fetching requests in status: %s", reqStatus) result = self.reqmgr2.getRequestByStatus([reqStatus], detail=True) if not result: requests = {} else: requests = result[0] self.logger.info(' retrieved %s requests in status: %s', len(requests), reqStatus) return requests
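# NOTE (illustrative): execute() above takes a list of ReqMgr2 statuses and merges the
# per-status results of getRequestRecords() into a single dict before dispatching. A
# hypothetical driver call (the status list is an assumption based on the statuses
# handled in _dispatchWflow; msRuleCleaner stands for an MSRuleCleaner instance):
reqStatus = ['announced', 'rejected', 'aborted-completed']
summary = msRuleCleaner.execute(reqStatus)
print(summary.get('total_num_requests'), summary.get('archived_num_requests'))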
class MSRuleCleaner(MSCore): """ MSRuleCleaner.py class provides the logic used to clean the Rucio block level data placement rules created by WMAgent. """ def __init__(self, msConfig, logger=None): """ Runs the basic setup and initialization for the MSRuleCleaner module :param msConfig: micro service configuration """ super(MSRuleCleaner, self).__init__(msConfig, logger=logger) self.msConfig.setdefault("verbose", True) self.msConfig.setdefault("interval", 60) self.msConfig.setdefault("services", ['ruleCleaner']) self.msConfig.setdefault("rucioWmaAccount", "wma_test") self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor") self.msConfig.setdefault('enableRealMode', False) self.mode = "RealMode" if self.msConfig['enableRealMode'] else "DryRunMode" self.curlMgr = RequestHandler() self.targetStatusRegex = re.compile(r'.*archived') self.logDB = LogDB(self.msConfig["logDBUrl"], self.msConfig["logDBReporter"], logger=self.logger) self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger) # Building all the Pipelines: pName = 'plineMSTrCont' self.plineMSTrCont = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.setParentDatasets), Functor(self.getRucioRules, 'container', self.msConfig['rucioMStrAccount']), Functor(self.cleanRucioRules)]) pName = 'plineMSTrBlock' self.plineMSTrBlock = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.setParentDatasets), Functor(self.getRucioRules, 'block', self.msConfig['rucioMStrAccount']), Functor(self.cleanRucioRules)]) pName = 'plineAgentCont' self.plineAgentCont = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'container', self.msConfig['rucioWmaAccount']), Functor(self.cleanRucioRules)]) pName = 'plineAgentBlock' self.plineAgentBlock = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'block', self.msConfig['rucioWmaAccount']), Functor(self.cleanRucioRules)]) pName = 'plineArchive' self.plineArchive = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.findTargetStatus), Functor(self.setClean), Functor(self.setArchivalDelayExpired), Functor(self.setLogDBClean), Functor(self.archive)]) # Building the different set of plines we will need later: # NOTE: The following are all the functional pipelines which are supposed to include # a cleanup function and report cleanup status in the MSRuleCleanerWflow object self.cleanuplines = [self.plineMSTrCont, self.plineMSTrBlock, self.plineAgentCont, self.plineAgentBlock] # Building an auxiliary list of cleanup pipeline names only: self.cleanupPipeNames = [pline.name for pline in self.cleanuplines] # Building lists of pipelines related only to Agents or MStransferror self.agentlines = [self.plineAgentCont, self.plineAgentBlock] self.mstrlines = [self.plineMSTrCont, self.plineMSTrBlock] # Initialization of the 'cleaned' and 'archived' counters: self.wfCounters = {'cleaned': {}, 'archived': {'normalArchived': 0, 'forceArchived': 0}} self.globalLocks = set() def getGlobalLocks(self): """ Fetches the list of 'globalLocks' from wmstats server and the list of 'parentLocks' from request manager. Stores/updates the unified set in the 'globalLocks' instance variable. Returns the resultant unified set. 
:return: A union set of the 'globalLocks' and the 'parentLocks' lists """ self.logger.info("Fetching globalLocks list from wmstats server.") try: globalLocks = set(self.wmstatsSvc.getGlobalLocks()) except Exception as ex: msg = "Failed to refresh global locks list for the current polling cycle. Error: %s " msg += "Skipping this polling cycle." self.logger.error(msg, str(ex)) raise ex self.logger.info("Fetching parentLocks list from reqmgr2 server.") try: parentLocks = set(self.reqmgr2.getParentLocks()) except Exception as ex: msg = "Failed to refresh parent locks list for the current poling cycle. Error: %s " msg += "Skipping this polling cycle." self.logger.error(msg, str(ex)) raise ex self.globalLocks = globalLocks | parentLocks def resetCounters(self): """ A simple function for zeroing the cleaned and archived counters. """ for pline in self.cleanuplines: self.wfCounters['cleaned'][pline.name] = 0 self.wfCounters['archived']['normalArchived'] = 0 self.wfCounters['archived']['forceArchived'] = 0 def execute(self, reqStatus): """ Executes the whole ruleCleaner logic :return: summary """ # start threads in MSManager which should call this method summary = dict(RULECLEANER_REPORT) self.currThread = current_thread() self.currThreadIdent = self.currThread.name self.updateReportDict(summary, "thread_id", self.currThreadIdent) self.resetCounters() self.logger.info("MSRuleCleaner is running in mode: %s.", self.mode) # Build the list of workflows to work on: try: requestRecords = {} for status in reqStatus: requestRecords.update(self.getRequestRecords(status)) except Exception as err: # general error msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s", str(err) self.logger.exception(msg) self.updateReportDict(summary, "error", msg) # Call _execute() and feed the relevant pipeline with the objects popped from requestRecords try: self.getGlobalLocks() totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests = self._execute(requestRecords) msg = "\nNumber of processed workflows: %s." msg += "\nNumber of properly cleaned workflows: %s." msg += "\nNumber of normally archived workflows: %s." msg += "\nNumber of force archived workflows: %s." self.logger.info(msg, totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests) self.updateReportDict(summary, "total_num_requests", totalNumRequests) self.updateReportDict(summary, "clean_num_requests", cleanNumRequests) self.updateReportDict(summary, "normal_archived_num_requests", normalArchivedNumRequests) self.updateReportDict(summary, "force_archived_num_requests", forceArchivedNumRequests) except Exception as ex: msg = "Unknown exception while running MSRuleCleaner thread Error: %s" self.logger.exception(msg, str(ex)) self.updateReportDict(summary, "error", msg) return summary def _execute(self, reqRecords): """ Executes the MSRuleCleaner pipelines based on the workflow status :param reqList: A list of RequestRecords to work on :return: a tuple with: number of properly cleaned requests number of processed workflows number of archived workflows """ # NOTE: The Input Cleanup, the Block Level Cleanup and the Archival # Pipelines are executed sequentially in the above order. 
# This way we assure ourselves that we archive only workflows # that have accomplished the needed cleanup cleanNumRequests = 0 totalNumRequests = 0 # Call the workflow dispatcher: for req in viewvalues(reqRecords): wflow = MSRuleCleanerWflow(req) self._dispatchWflow(wflow) msg = "\n----------------------------------------------------------" msg += "\nMSRuleCleanerWflow: %s" msg += "\n----------------------------------------------------------" self.logger.debug(msg, pformat(wflow)) totalNumRequests += 1 if self._checkClean(wflow): cleanNumRequests += 1 # Report the counters: for pline in self.cleanuplines: msg = "Workflows cleaned by pipeline: %s: %d" self.logger.info(msg, pline.name, self.wfCounters['cleaned'][pline.name]) normalArchivedNumRequests = self.wfCounters['archived']['normalArchived'] forceArchivedNumRequests = self.wfCounters['archived']['forceArchived'] self.logger.info("Workflows normally archived: %d", self.wfCounters['archived']['normalArchived']) self.logger.info("Workflows force archived: %d", self.wfCounters['archived']['forceArchived']) return totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests def _dispatchWflow(self, wflow): """ A function intended to dispatch a workflow (e.g based on its status) through one or more functional pipelines in case there is some more complicated logic involved in the order we execute them but not just a sequentially """ self.logger.debug("Dispatching workflow: %s", wflow['RequestName']) # NOTE: The following dispatch logic is a subject to be changed at any time # Resolve: # NOTE: First resolve any preliminary flags that will be needed further # in the logic of the _dispatcher() itself if wflow['RequestStatus'] == 'announced': self.getMSOutputTransferInfo(wflow) # Clean: # Do not clean any Resubmission, but still let them be archived if wflow['RequestType'] == 'Resubmission': wflow['ForceArchive'] = True msg = "Skipping cleanup step for workflow: %s - RequestType is %s." msg += " Will try to archive it directly." self.logger.info(msg, wflow['RequestName'], wflow['RequestType']) elif wflow['RequestStatus'] in ['rejected', 'aborted-completed']: # NOTE: We do not check the ParentageResolved flag for these # workflows, but we do need to clean output data placement # rules from the agents for them for pline in self.agentlines: try: pline.run(wflow) except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 elif wflow['RequestStatus'] == 'announced' and not wflow['ParentageResolved']: # NOTE: We skip workflows which are not having 'ParentageResolved' # flag, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'ParentageResolved' flag set to false." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced' and not wflow['TransferDone']: # NOTE: We skip workflows which have not yet finalised their TransferStatus # in MSOutput, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'TransferStatus' is 'pending' or 'TransferInfo' is missing in MSOutput." msg += " Will retry again in the next cycle." 
self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced' and not wflow['TransferTape']: # NOTE: We skip workflows which have not yet finalised their tape transfers. # (i.e. even if a single output which is supposed to be covered # by a tape rule is in any of the following transient states: # {REPLICATING, STUCK, SUSPENDED, WAITING_APPROVAL}.) # We still need some proper logging for them. msg = "Skipping workflow: %s - tape transfers are not yet completed." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced': for pline in self.cleanuplines: try: pline.run(wflow) except MSRuleCleanerResolveParentError as ex: msg = "%s: Parentage Resolve Error: %s. " msg += "Will retry again in the next cycle." self.logger.error(msg, pline.name, str(ex)) continue except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 else: # We shouldn't be here: msg = "Skipping workflow: %s - " msg += "Does not fall under any of the defined categories." self.logger.error(msg, wflow['RequestName']) # Archive: try: self.plineArchive.run(wflow) if wflow['ForceArchive']: self.wfCounters['archived']['forceArchived'] += 1 else: self.wfCounters['archived']['normalArchived'] += 1 except MSRuleCleanerArchivalSkip as ex: msg = "%s: Proper conditions not met: %s. " msg += "Skipping archival in the current cycle." self.logger.info(msg, wflow['PlineMarkers'][-1], str(ex)) except MSRuleCleanerArchivalError as ex: msg = "%s: Archival Error: %s. " msg += "Will retry again in the next cycle." self.logger.error(msg, wflow['PlineMarkers'][-1], str(ex)) except Exception as ex: msg = "%s General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, wflow['PlineMarkers'][-1], wflow['RequestName'], str(ex)) def setPlineMarker(self, wflow, pName): """ A function intended to mark which is the pipeline currently working on the workflow. It is supposed to be called always as a first function in the pipeline. :param wflow: A MSRuleCleaner workflow representation :param pName: The name of the functional pipeline :return: The workflow object """ # NOTE: The current functional pipeline MUST always be appended at the # end of the 'PlineMarkers' list # First get rid of the default: if not wflow['PlineMarkers']: wflow['PlineMarkers'] = [] # Then push our current value into the markers list: wflow['PlineMarkers'].append(pName) # Populate the list of flags to be used later: if pName not in wflow['RulesToClean']: if pName in self.cleanupPipeNames: wflow['RulesToClean'][pName] = [] if pName not in wflow['CleanupStatus']: if pName in self.cleanupPipeNames: wflow['CleanupStatus'][pName] = False return wflow def _checkClean(self, wflow): """ An auxiliary function used to only check the temporary cleanup status. 
It basically takes the pipelines registered in 'PlineMarkers' that have already worked on the workflow as a mask and applies this mask over the set of flags in the 'CleanupStatus' field and then reduces the result to a single bool value """ # NOTE: This is one of the few functions taking a workflow as an argument # but returning a bool, since it is an auxiliary function and is not # supposed to be called as a standalone function in a pipeline. # NOTE: `all([]) == True`, ergo all the 'rejected' && 'aborted-completed' workflows # are also counted as properly cleaned and can trigger archival later # Build a list of bool flags based on the mask of PlineMarkers cleanFlagsList = [wflow['CleanupStatus'][key] for key in wflow['PlineMarkers'] if key in wflow['CleanupStatus']] # If no one have worked on the workflow set the clean status to false if not wflow['PlineMarkers']: cleanStatus = False # If we have a mask longer than the list of flags avoid false positives # because of the behavior explained above - `all([]) == True` elif not cleanFlagsList: cleanStatus = False # Figure out the final value else: cleanStatus = all(cleanFlagsList) return cleanStatus def setClean(self, wflow): """ A function to set the 'IsClean' flag based on the status from all the pipelines which have worked on the workflow (and have put their markers in the 'PlineMarkers' list) :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ wflow['IsClean'] = self._checkClean(wflow) return wflow def _checkLogDBClean(self, wflow): """ An auxiliary function used to only check the LogDB cleanup status. It makes a query to LogDB in order to verify there are no any records for the current workflow :param wflow: A MSRuleCleaner workflow representation :return: True if no records were found in LogDB about wflow """ cleanStatus = False logDBRecords = self.logDB.get(wflow['RequestName']) self.logger.debug("logDBRecords: %s", pformat(logDBRecords)) if not logDBRecords: cleanStatus = True return cleanStatus def setLogDBClean(self, wflow): """ A function to set the 'IsLogDBClean' flag based on the presence of any records in LogDB for the current workflow. :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ wflow['IsLogDBClean'] = self._checkLogDBClean(wflow) if not wflow['IsLogDBClean'] and wflow['IsArchivalDelayExpired']: wflow['IsLogDBClean'] = self._cleanLogDB(wflow) return wflow def _cleanLogDB(self, wflow): """ A function to be used for cleaning all the records related to a workflow in logDB. 
:param wflow: A MSRuleCleaner workflow representation :return: True if NO errors were encountered while deleting records from LogDB """ cleanStatus = False try: if self.msConfig['enableRealMode']: self.logger.info("Deleting %s records from LogDB WMStats...", wflow['RequestName']) res = self.logDB.delete(wflow['RequestName'], agent=False) if res == 'delete-error': msg = "Failed to delete logDB docs for wflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalError(msg) cleanStatus = True else: self.logger.info("DRY-RUN: NOT Deleting %s records from LogDB WMStats...", wflow['RequestName']) except Exception as ex: msg = "General Exception while cleaning LogDB records for wflow: %s : %s" self.logger.exception(msg, wflow['RequestName'], str(ex)) return cleanStatus def findTargetStatus(self, wflow): """ Find the proper targeted archival status :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ # Check the available status transitions before we decide the final status targetStatusList = RequestStatus.REQUEST_STATE_TRANSITION.get(wflow['RequestStatus'], []) for status in targetStatusList: if self.targetStatusRegex.match(status): wflow['TargetStatus'] = status self.logger.debug("TargetStatus: %s", wflow['TargetStatus']) return wflow def _checkArchDelayExpired(self, wflow): """ A function to check Archival Expiration Delay based on the information returned by WMStatsServer regarding the time of the last request status transition :param wflow: MSRuleCleaner workflow representation :return: True if the archival delay have been expired """ archDelayExpired = False currentTime = int(time.time()) threshold = self.msConfig['archiveDelayHours'] * 3600 try: lastTransitionTime = wflow['RequestTransition'][-1]['UpdateTime'] if lastTransitionTime and (currentTime - lastTransitionTime) > threshold: archDelayExpired = True except KeyError: self.logger.debug("Could not find status transition history for %s", wflow['RequestName']) return archDelayExpired def setArchivalDelayExpired(self, wflow): """ A function to set the 'IsArchivalDelayExpired' flag """ wflow['IsArchivalDelayExpired'] = self._checkArchDelayExpired(wflow) return wflow def archive(self, wflow): """ Move the workflow to the proper archived status after checking the full cleanup status :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ # Make all the needed checks before trying to archive if not (wflow['IsClean'] or wflow['ForceArchive']): msg = "Not properly cleaned workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalSkip(msg) if not wflow['TargetStatus']: msg = "Could not determine which archival status to target for workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalError(msg) if not wflow['IsLogDBClean']: msg = "LogDB records have not been cleaned for workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalSkip(msg) if not wflow['IsArchivalDelayExpired']: msg = "Archival delay period has not yet expired for workflow: %s." % wflow['RequestName'] raise MSRuleCleanerArchivalSkip(msg) if not self.msConfig['enableRealMode']: msg = "Real Run Mode not enabled." 
raise MSRuleCleanerArchivalSkip(msg) # Proceed with the actual archival: try: self.reqmgr2.updateRequestStatus(wflow['RequestName'], wflow['TargetStatus']) msg = "Successful status transition to: %s for workflow: %s" self.logger.info(msg, wflow['TargetStatus'], wflow['RequestName']) except Exception as ex: msg = "General Exception while trying status transition to: %s " % wflow['TargetStatus'] msg += "for workflow: %s : %s" % (wflow['RequestName'], str(ex)) raise MSRuleCleanerArchivalError(msg) return wflow def getMSOutputTransferInfo(self, wflow): """ Fetches the transfer information from the MSOutput REST interface for the given workflow. :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ headers = {'Accept': 'application/json'} params = {} url = '%s/data/info?request=%s' % (self.msConfig['msOutputUrl'], wflow['RequestName']) try: res = self.curlMgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert()) data = json.loads(res)['result'][0] transferInfo = data['transferDoc'] except Exception as ex: msg = "General exception while fetching TransferInfo from MSOutput for %s. " msg += "Error: %s" self.logger.exception(msg, wflow['RequestName'], str(ex)) # Set Transfer status - information fetched from MSOutput only if transferInfo is not None and transferInfo['TransferStatus'] == 'done': wflow['TransferDone'] = True # Set Tape rules status - information fetched from Rucio (tape rule ids from MSOutput) if transferInfo is not None and transferInfo['OutputMap']: tapeRulesStatusList = [] # For setting 'TransferTape' = True we require either no tape rules for the # workflow have been created or all existing tape rules to be in status 'OK', # so every empty TapeRuleID we consider as completed. for mapRecord in transferInfo['OutputMap']: if not mapRecord['TapeRuleID']: continue rucioRule = self.rucio.getRule(mapRecord['TapeRuleID']) if not rucioRule: tapeRulesStatusList.append(False) msg = "Tape rule: %s not found for workflow: %s " msg += "Possible server side error." self.logger.error(msg, mapRecord['TapeRuleID'], wflow['RequestName']) continue if rucioRule['state'] == 'OK': tapeRulesStatusList.append(True) msg = "Tape rule: %s in final state: %s for workflow: %s" self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName']) else: tapeRulesStatusList.append(False) msg = "Tape rule: %s in non final state: %s for workflow: %s" self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName']) if all(tapeRulesStatusList): wflow['TransferTape'] = True return wflow def setParentDatasets(self, wflow): """ Used to resolve parent datasets for a workflow. 
:param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ if wflow['InputDataset'] and wflow['IncludeParents']: childDataset = wflow['InputDataset'] parentDataset = findParent([childDataset], self.msConfig['dbsUrl']) # NOTE: If findParent() returned None then the DBS service failed to # resolve the request (it is considered an ERROR outside WMCore) if parentDataset.get(childDataset, None) is None: msg = "Failed to resolve parent dataset for: %s in workflow: %s" % (childDataset, wflow['RequestName']) raise MSRuleCleanerResolveParentError(msg) elif parentDataset: wflow['ParentDataset'] = [parentDataset[childDataset]] msg = "Found parent %s for input dataset %s in workflow: %s " self.logger.info(msg, parentDataset, wflow['InputDataset'], wflow['RequestName']) else: msg = "Could not find parent for input dataset: %s in workflows: %s" self.logger.error(msg, wflow['InputDataset'], wflow['RequestName']) return wflow def getRucioRules(self, wflow, gran, rucioAcct): """ Queries Rucio and builds the relevant list of blocklevel rules for the given workflow :param wflow: A MSRuleCleaner workflow representation :param gran: Data granularity to search for Rucio rules. Possible values: 'block' or 'container' :return: The workflow object """ currPline = wflow['PlineMarkers'][-1] # Create the container list to the rucio account map and set the checkGlobalLocks flag. mapRuleType = {self.msConfig['rucioWmaAccount']: ["OutputDatasets"], self.msConfig['rucioMStrAccount']: ["InputDataset", "MCPileup", "DataPileup", "ParentDataset"]} if rucioAcct == self.msConfig['rucioMStrAccount']: checkGlobalLocks = True else: checkGlobalLocks = False # Find all the data placement rules created by the components: for dataType in mapRuleType[rucioAcct]: dataList = wflow[dataType] if isinstance(wflow[dataType], list) else [wflow[dataType]] for dataCont in dataList: if dataCont is None: continue self.logger.debug("getRucioRules: dataCont: %s", pformat(dataCont)) if checkGlobalLocks and dataCont in self.globalLocks: msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the " msg += "RulesToClean list for both container and block level Rules for workflow: %s!" self.logger.info(msg, dataCont, wflow['RequestName']) continue if gran == 'container': for rule in self.rucio.listDataRules(dataCont, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) msg = "Found %s container-level rule to be deleted for container %s" self.logger.info(msg, rule['id'], dataCont) elif gran == 'block': try: blocks = self.rucio.getBlocksInContainer(dataCont) for block in blocks: for rule in self.rucio.listDataRules(block, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) msg = "Found %s block-level rule to be deleted for container %s" self.logger.info(msg, rule['id'], dataCont) except WMRucioDIDNotFoundException: msg = "Container: %s not found in Rucio for workflow: %s." self.logger.info(msg, dataCont, wflow['RequestName']) return wflow def cleanRucioRules(self, wflow): """ Cleans all the Rules present in the field 'RulesToClean' in the MSRuleCleaner workflow representation. And fills the relevant Cleanup Status. :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ # NOTE: The function should be called independently and sequentially from # The Input and the respective BlockLevel pipelines. 
# NOTE: The current functional pipeline is always the last one in the PlineMarkers list currPline = wflow['PlineMarkers'][-1] delResults = [] if self.msConfig['enableRealMode']: for rule in wflow['RulesToClean'][currPline]: self.logger.info("%s: Deleting ruleId: %s ", currPline, rule) delResult = self.rucio.deleteRule(rule) delResults.append(delResult) if not delResult: self.logger.warning("%s: Failed to delete ruleId: %s ", currPline, rule) else: for rule in wflow['RulesToClean'][currPline]: delResults.append(True) self.logger.info("%s: DRY-RUN: Is about to delete ruleId: %s ", currPline, rule) # Set the cleanup flag: wflow['CleanupStatus'][currPline] = all(delResults) return wflow def getRequestRecords(self, reqStatus): """ Queries ReqMgr2 for requests in a given status. :param reqStatus: The status for the requests to be fetched from ReqMgr2 :return requests: A dictionary with all the workflows in the given status """ self.logger.info("Fetching requests in status: %s", reqStatus) result = self.reqmgr2.getRequestByStatus([reqStatus], detail=True) if not result: requests = {} else: requests = result[0] self.logger.info(' retrieved %s requests in status: %s', len(requests), reqStatus) return requests
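# NOTE (illustrative): getRequestRecords() above relies on reqmgr2.getRequestByStatus()
# returning a list whose first element is a dict keyed by request name. A hedged sketch
# of that assumed shape (request names and field values are made up):
result = [
    {"wflowA": {"RequestName": "wflowA", "RequestStatus": "announced", "RequestType": "TaskChain"},
     "wflowB": {"RequestName": "wflowB", "RequestStatus": "announced", "RequestType": "Resubmission"}}
]
requests = result[0] if result else {}
# len(requests) == 2, which is what the " retrieved %s requests in status: %s" log line reports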
def msOutputProducer(self, requestRecords): """ A top level function to drive the upload of all the documents to MongoDB """ # DONE: # To implement this as a functional pipeline in the following sequence: # 1) document streamer - to generate all the records coming from Reqmgr2 # 2) document stripper - to cut all the cut all the kews we do not need # Mongodb document creator - to pass it through the MongoDBTemplate # 3) document updater - fetch & update all the needed info like campaign config etc. # 4) MongoDB upload/update - to upload/update the document in Mongodb # DONE: # to have the requestRecords generated through a call to docStreamer # and the call should happen from inside this function so that all # the Objects generated do not leave the scope of this function and # with that to reduce big memory footprint # DONE: # to set a destructive function at the end of the pipeline # NOTE: # To discuss the collection names # NOTE: # Here we should never use docUploader with `update=True`, because # this will erase the latest state of already existing and fully or # partially processed documents by the Consumer pipeline self.logger.info("Running the msOutputProducer ...") msPipelineRelVal = Pipeline(name="MSOutputProducer PipelineRelVal", funcLine=[ Functor(self.docTransformer), Functor(self.docKeyUpdate, isRelVal=True), Functor(self.docInfoUpdate, pipeLine='PipelineRelVal'), Functor(self.docUploader, self.msOutRelValColl), Functor(self.docCleaner) ]) msPipelineNonRelVal = Pipeline( name="MSOutputProducer PipelineNonRelVal", funcLine=[ Functor(self.docTransformer), Functor(self.docKeyUpdate, isRelVal=False), Functor(self.docInfoUpdate, pipeLine='PipelineNonRelVal'), Functor(self.docUploader, self.msOutNonRelValColl), Functor(self.docCleaner) ]) # TODO: # To generate the object from within the Function scope see above. counter = 0 for _, request in requestRecords: counter += 1 try: if request.get('SubRequestType') == 'RelVal': pipeLine = msPipelineRelVal pipeLineName = pipeLine.getPipelineName() pipeLine.run(request) else: pipeLine = msPipelineNonRelVal pipeLineName = pipeLine.getPipelineName() pipeLine.run(request) except KeyError as ex: msg = "%s Possibly broken read from Reqmgr2 API or other Err: %s. " % ( pipeLineName, str(ex)) msg += "Continue to the next document." self.logger.exception(msg) continue except TypeError as ex: msg = "%s Possibly broken read from Reqmgr2 API or other Err: %s. " % ( pipeLineName, str(ex)) msg += "Continue to the next document." self.logger.exception(msg) continue except Exception as ex: msg = "%s General Error from pipeline. Err: %s. " % ( pipeLineName, str(ex)) msg += "Giving up Now." self.logger.error(msg) self.logger.exception(ex) break return counter
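# NOTE (illustrative): this older producer iterates requestRecords as (name, request)
# pairs and branches on SubRequestType, so the caller presumably passes something like
# dict items. A hypothetical invocation (msOutput stands for an MSOutput instance and
# the request content is trimmed to the fields touched here):
requestRecords = {
    "reqA": {"RequestName": "reqA", "SubRequestType": "RelVal"},
    "reqB": {"RequestName": "reqB", "SubRequestType": ""},
}.items()
nDocs = msOutput.msOutputProducer(requestRecords)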
def msOutputConsumer(self): """ A top level function to drive the creation and book keeping of all the subscriptions to the Data Management System """ # DONE: # Done: To check if the 'enableDataPlacement' flag is really taken into account # Done: To make this for both relvals and non relvals # Done: To return the result # Done: To make report document # Done: To build it through a pipe # Done: To write back the updated document to MonogoDB msPipelineRelVal = Pipeline(name="MSOutputConsumer PipelineRelVal", funcLine=[ Functor(self.docReadfromMongo, self.msOutRelValColl, setTaken=False), Functor(self.makeSubscriptions), Functor(self.docKeyUpdate, isTaken=False, isTakenBy=None, lastUpdate=int(time())), Functor(self.docUploader, self.msOutRelValColl, update=True, keys=[ 'isTaken', 'lastUpdate', 'transferStatus', 'transferIDs' ]), Functor(self.docDump, pipeLine='PipelineRelVal'), Functor(self.docCleaner) ]) msPipelineNonRelVal = Pipeline( name="MSOutputConsumer PipelineNonRelVal", funcLine=[ Functor(self.docReadfromMongo, self.msOutNonRelValColl, setTaken=False), Functor(self.makeSubscriptions), Functor(self.docKeyUpdate, isTaken=False, isTakenBy=None, lastUpdate=int(time())), Functor(self.docUploader, self.msOutNonRelValColl, update=True, keys=[ 'isTaken', 'lastUpdate', 'transferStatus', 'transferIDs' ]), Functor(self.docDump, pipeLine='PipelineNonRelVal'), Functor(self.docCleaner) ]) # NOTE: # If we actually have any exception that has reached to the top level # exception handlers (eg. here - outside the pipeLine), this means # some function from within the pipeLine has not caught it and the msOutDoc # has left the pipe and died before the relevant document in MongoDB # has been released (its flag 'isTaken' to be set back to False) wfCounters = {} for pipeLine in [msPipelineRelVal, msPipelineNonRelVal]: pipeLineName = pipeLine.getPipelineName() wfCounters[pipeLineName] = 0 while wfCounters[pipeLineName] < self.msConfig[ 'limitRequestsPerCycle']: # take only workflows: # - which are not already taken or # - a transfer subscription have never been done for them and # - avoid retrying workflows in the same cycle # NOTE: # Once we are running the service not in a dry run mode we may # consider adding and $or condition in mQueryDict for transferStatus: # '$or': [{'transferStatus': None}, # {'transferStatus': 'incomplete'}] # So that we can collect also workflows with partially or fully # unsuccessful transfers currTime = int(time()) treshTime = currTime - self.msConfig['interval'] mQueryDict = { '$and': [{ 'isTaken': False }, { '$or': [{ 'transferStatus': None }, { 'transferStatus': 'incomplete' }] }, { '$or': [{ 'lastUpdate': None }, { 'lastUpdate': { '$lt': treshTime } }] }] } # FIXME: # To redefine those exceptions as MSoutputExceptions and # start using those here so we do not mix with general errors try: pipeLine.run(mQueryDict) except KeyError as ex: msg = "%s Possibly malformed record in MongoDB. Err: %s. " % ( pipeLineName, str(ex)) msg += "Continue to the next document." self.logger.exception(msg) continue except TypeError as ex: msg = "%s Possibly malformed record in MongoDB. Err: %s. " % ( pipeLineName, str(ex)) msg += "Continue to the next document." self.logger.exception(msg) continue except EmptyResultError as ex: msg = "%s All relevant records in MongoDB exhausted. " % pipeLineName msg += "We are done for the current cycle." self.logger.info(msg) break except Exception as ex: msg = "%s General Error from pipeline. Err: %s. " % ( pipeLineName, str(ex)) msg += "Giving up Now." 
self.logger.error(msg) self.logger.exception(ex) break wfCounters[pipeLineName] += 1 wfCounterTotal = sum(wfCounters.values()) return wfCounterTotal
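# NOTE (illustrative): the mQueryDict built above only selects documents that are not
# taken, whose transfer is missing or incomplete, and which were not updated within the
# last polling interval. Two hypothetical documents, assuming an interval of 60 seconds:
import time

currTime = int(time.time())
treshTime = currTime - 60

eligibleDoc = {'isTaken': False, 'transferStatus': None, 'lastUpdate': None}
skippedDoc = {'isTaken': False, 'transferStatus': 'incomplete', 'lastUpdate': currTime - 10}
# eligibleDoc matches every clause: untaken, no transfer yet, never updated.
# skippedDoc fails the "lastUpdate < treshTime" clause and is retried in a later cycle.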
def msOutputConsumer(self): """ A top level function to drive the creation and book keeping of all the subscriptions to the Data Management System """ # DONE: # Done: To check if the 'enableDataPlacement' flag is really taken into account # Done: To make this for both relvals and non relvals # Done: To return the result # Done: To make report document # Done: To build it through a pipe # Done: To write back the updated document to MonogoDB msPipelineRelVal = Pipeline( name="MSOutputConsumer PipelineRelVal", funcLine=[ Functor(self.makeSubscriptions), Functor(self.makeTapeSubscriptions), Functor(self.docUploader, update=True, keys=['LastUpdate', 'TransferStatus', 'OutputMap']), Functor(self.docDump, pipeLine='PipelineRelVal'), Functor(self.docCleaner) ]) msPipelineNonRelVal = Pipeline( name="MSOutputConsumer PipelineNonRelVal", funcLine=[ Functor(self.makeSubscriptions), Functor(self.makeTapeSubscriptions), Functor(self.docUploader, update=True, keys=['LastUpdate', 'TransferStatus', 'OutputMap']), Functor(self.docDump, pipeLine='PipelineNonRelVal'), Functor(self.docCleaner) ]) wfCounterTotal = 0 mQueryDict = {'TransferStatus': 'pending'} pipeCollections = [(msPipelineRelVal, self.msOutRelValColl), (msPipelineNonRelVal, self.msOutNonRelValColl)] for pipeColl in pipeCollections: wfCounters = 0 pipeLine = pipeColl[0] dbColl = pipeColl[1] pipeLineName = pipeLine.getPipelineName() for docOut in self.getDocsFromMongo( mQueryDict, dbColl, self.msConfig['limitRequestsPerCycle']): # FIXME: # To redefine those exceptions as MSoutputExceptions and # start using those here so we do not mix with general errors try: # If it's in MongoDB, it can get into our in-memory cache self.requestNamesCached.append(docOut['RequestName']) pipeLine.run(docOut) except (KeyError, TypeError) as ex: msg = "%s Possibly malformed record in MongoDB. Err: %s. " % ( pipeLineName, str(ex)) msg += "Continue to the next document." self.logger.exception(msg) continue except EmptyResultError as ex: msg = "%s All relevant records in MongoDB exhausted. " % pipeLineName msg += "We are done for the current cycle." self.logger.info(msg) break except Exception as ex: msg = "%s General error from pipeline. Err: %s. " % ( pipeLineName, str(ex)) msg += "Will retry again in the next cycle." self.logger.exception(msg) break wfCounters += 1 self.logger.info("Processed %d workflows from pipeline: %s", wfCounters, pipeLineName) wfCounterTotal += wfCounters return wfCounterTotal
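# NOTE (illustrative): getDocsFromMongo() is not shown in this section; the loop above
# treats it as a generator bounded by limitRequestsPerCycle (and, per the except clause,
# the real helper can apparently also raise EmptyResultError when nothing is left). A
# minimal sketch under those assumptions, for a pymongo-like collection object:
def getDocsFromMongo(mQueryDict, dbColl, limit):
    """Yield at most `limit` documents matching mQueryDict from the collection dbColl."""
    for mongoDoc in dbColl.find(mQueryDict).limit(limit):
        yield mongoDoc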
class MSUnmerged(MSCore):
    """
    MSUnmerged.py class provides the logic for cleaning the unmerged area of
    the CMS LFN Namespace.
    """

    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSUnmerged module
        :param msConfig: micro service configuration
        """
        super(MSUnmerged, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        # self.msConfig.setdefault('limitRSEsPerInstance', 100)
        # self.msConfig.setdefault('limitTiersPerInstance', ['T1', 'T2', 'T3'])
        # self.msConfig.setdefault("rucioAccount", "FIXME_RUCIO_ACCT")
        self.msConfig.setdefault("rseExpr", "*")
        # TODO: Add 'alertManagerUrl' to msConfig
        # self.alertServiceName = "ms-unmerged"
        # self.alertManagerAPI = AlertManagerAPI(self.msConfig.get("alertManagerUrl", None), logger=logger)

        # Building all the Pipelines:
        pName = 'plineUnmerged'
        self.plineUnmerged = Pipeline(name=pName,
                                      funcLine=[Functor(self.cleanFiles)])

        # Initialization of the deleted files counters:
        self.rseCounters = {}
        self.plineCounters = {}

    def execute(self):
        """
        Executes the whole MSUnmerged logic
        :return: summary
        """
        # start threads in MSManager which should call this method
        summary = dict(UNMERGED_REPORT)
        self.resetCounters()

        try:
            rseList = self.getRSEList()
            self.updateReportDict(summary, "total_num_rses", len(rseList))
            msg = "Retrieved %s RSEs. " % len(rseList)
            msg += "Service set to process up to %s RSEs per instance." % self.msConfig["limitRSEsPerInstance"]
            self.logger.info(msg)
        except Exception as err:  # general error
            msg = "Unknown exception while trying to estimate the final RSEs to work on. Error: %s" % str(err)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)

        try:
            totalNumRses, totalNumFiles, numRsesCleaned, numFilesDeleted = self._execute(rseList)
            msg = "\nTotal number of processed RSEs: %s."
            msg += "\nTotal number of files to be deleted: %s."
            msg += "\nNumber of RSEs cleaned: %s."
            msg += "\nNumber of files deleted: %s."
            self.logger.info(msg, totalNumRses, totalNumFiles, numRsesCleaned, numFilesDeleted)
            self.updateReportDict(summary, "total_num_rses", totalNumRses)
            self.updateReportDict(summary, "total_num_files", totalNumFiles)
            self.updateReportDict(summary, "num_rses_cleaned", numRsesCleaned)
            self.updateReportDict(summary, "num_files_deleted", numFilesDeleted)
        except Exception as ex:
            msg = "Unknown exception while running the MSUnmerged thread. Error: %s"
            self.logger.exception(msg, str(ex))
            self.updateReportDict(summary, "error", msg)

        return summary

    def _execute(self, rseList):
        """
        Executes the MSUnmerged pipelines
        :param rseList: A list of RSEs to work on
        :return: a tuple with:
            total number of RSEs
            total number of files found for deletion
            number of RSEs cleaned
            number of files deleted
        """
        totalNumRses = 0
        totalNumFiles = 0
        numRsesCleaned = 0
        numFilesDeleted = 0

        # Call the workflow dispatcher:
        for rse in rseList:
            try:
                rse = MSUnmergedRSE(rse)
                self.plineUnmerged.run(rse)
                msg = "\n----------------------------------------------------------"
                msg += "\nMSUnmergedRSE: %s"
                msg += "\n----------------------------------------------------------"
                self.logger.debug(msg, pformat(rse))

                totalNumRses += 1
                if rse['isClean']:
                    numRsesCleaned += 1
                totalNumFiles += rse['counters']['totalNumFiles']
                numFilesDeleted += rse['counters']['numFilesDeleted']
            except Exception as ex:
                msg = "%s: General error from pipeline. RSE: %s. Error: \n%s. "
                msg += "\nWill retry again in the next cycle."
                self.logger.exception(msg, self.plineUnmerged.name, rse['rse'], str(ex))
                continue
        return totalNumRses, totalNumFiles, numRsesCleaned, numFilesDeleted

    def cleanFiles(self, rse):
        """
        The method to implement the actual deletion of files for an RSE.
        :param rse: MSUnmergedRSE object to be cleaned
        :return: The MSUnmergedRSE object
        """
        try:
            # for fileUnmerged in rse['files']['toDelete']:
            #     try:
            #         self.gfalCommand(rse['delInterface'], fileUnmerged)
            #         rse['counters']['numFilesDeleted'] += 1
            #         rse['files']['deletedSuccess'].append(fileUnmerged)
            #     except Exception as ex:
            #         rse['files']['deletedFail'].append(fileUnmerged)
            #         msg = "Error while trying to delete file: %s for RSE: %s. "
            #         msg += "Will retry in the next cycle. Err: %s"
            #         self.logger.debug(msg, fileUnmerged, rse['name'], str(ex))
            rse['isClean'] = self._checkClean(rse)
        except Exception as ex:
            msg = "Error while cleaning RSE: %s. "
            msg += "Will retry in the next cycle. Err: %s"
            self.logger.debug(msg, rse['name'], str(ex))
        return rse

    def _checkClean(self, rse):
        """
        A simple function to check if every file in an RSE's unmerged area has
        been deleted.
        :param rse: The RSE to be checked
        :return: Bool: True if all files found have been deleted, False otherwise
        """
        return rse['counters']['totalNumFiles'] == rse['counters']['numFilesDeleted']

    def resetCounters(self):
        """
        A simple function for zeroing the deleted files counters.
        """
        for rse in self.rseCounters:
            self.rseCounters[rse]['deletedSuccess'] = 0
            self.rseCounters[rse]['deletedFail'] = 0
        for pline in self.plineCounters:
            self.plineCounters[pline.name]['deletedSuccess'] = 0
            self.plineCounters[pline.name]['deletedFail'] = 0

    def getRSEList(self):
        """
        Queries Rucio for the proper RSE list to iterate through.
        :return: List of RSE names.
        """
        rseList = []
        try:
            rseList = self.rucio.evaluateRSEExpression(self.msConfig['rseExpr'])
        except Exception as ex:
            msg = "Unknown exception while trying to fetch the initial list of RSEs to work on. Err: %s"
            self.logger.exception(msg, str(ex))
        return rseList
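# NOTE (illustrative sketch, not the WMCore implementation):
#    Every service above composes its work as a Pipeline of Functors, where each
#    step receives the object returned by the previous step and hands its result
#    to the next one. The standalone classes and step functions below are a minimal,
#    hypothetical equivalent of that pattern, shown only to make the data flow
#    explicit; they are not the actual Pipeline/Functor classes used by the services.
class FunctorSketch(object):
    """Bind a callable together with extra positional and keyword arguments."""
    def __init__(self, func, *args, **kwargs):
        self.func = func
        self.args = args
        self.kwargs = kwargs

    def __call__(self, obj):
        return self.func(obj, *self.args, **self.kwargs)

class PipelineSketch(object):
    """Run an object through an ordered list of functors."""
    def __init__(self, name, funcLine):
        self.name = name
        self.funcLine = funcLine

    def getPipelineName(self):
        return self.name

    def run(self, obj):
        for functor in self.funcLine:
            obj = functor(obj)
        return obj

# Hypothetical usage: mark an RSE-like dict as visited, then flag it as clean.
def markVisited(rse):
    rse['visited'] = True
    return rse

def markClean(rse, isClean=True):
    rse['isClean'] = isClean
    return rse

pline = PipelineSketch("demo pipeline",
                       [FunctorSketch(markVisited),
                        FunctorSketch(markClean, isClean=True)])
print(pline.run({'rse': 'T2_Example_Site'}))  # {'rse': 'T2_Example_Site', 'visited': True, 'isClean': True}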