Example no. 1
    def msOutputProducer(self, requestRecords):
        """
        A top level function to fetch requests from ReqMgr2 and produce the corresponding
        records for MSOutput in MongoDB.
        :param requestRecords: dictionary of request dictionaries retrieved from ReqMgr2, keyed by request name

        It's implemented as a pipeline, performing the following sequential actions:
           1) document transformer - creates a MSOutputTemplate object from the request dict
           2) document info updater - parses the MSOutputTemplate object and updates the
              necessary data structure mapping output/locations/campaign/etc
           3) document uploader - inserts the MSOutputTemplate object into the correct
              MongoDB collection (ReVal is separated from standard workflows)
           4) document cleaner - releases memory reference to the MSOutputTemplate object
        """
        # DONE:
        #    A destructive function (docCleaner) is set at the end of the pipeline
        # NOTE:
        #    To discuss the collection names
        # NOTE:
        #    Here we should never use docUploader with `update=True`, because
        #    this will erase the latest state of already existing and fully or
        #    partially processed documents by the Consumer pipeline
        self.logger.info("Running the msOutputProducer ...")
        msPipeline = Pipeline(name="MSOutputProducer Pipeline",
                              funcLine=[
                                  Functor(self.docTransformer),
                                  Functor(self.docInfoUpdate),
                                  Functor(self.docUploader),
                                  Functor(self.docCleaner)
                              ])
        # TODO:
        #    Generate the object from within the Functor scope (see the note above).
        counter = 0
        for request in viewvalues(requestRecords):
            if request['RequestName'] in self.requestNamesCached:
                # if it's cached, then it's already in MongoDB, no need to redo this thing!
                continue
            counter += 1
            pipeLineName = msPipeline.getPipelineName()
            try:
                msPipeline.run(request)
            except (KeyError, TypeError) as ex:
                msg = "%s Possibly broken read from ReqMgr2 API or other. Err: %s." % (
                    pipeLineName, str(ex))
                msg += " Continue to the next document."
                self.logger.exception(msg)
                continue
            except Exception as ex:
                msg = "%s General Error from pipeline. Err: %s. " % (
                    pipeLineName, str(ex))
                msg += "Giving up Now."
                self.logger.exception(msg)
                break
        return counter
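
For context, here is a minimal sketch of the Pipeline/Functor pattern the producer above relies on. This is a hypothetical reimplementation for illustration only, not WMCore's actual classes: each Functor wraps a callable plus any pre-bound arguments, and Pipeline.run() threads a single document through the funcLine sequentially.

class Functor(object):
    """Wrap a callable plus pre-bound arguments; the pipeline payload is
    passed as the first positional argument at call time."""
    def __init__(self, func, *args, **kwargs):
        self.func = func
        self.args = args
        self.kwargs = kwargs

    def __call__(self, obj):
        return self.func(obj, *self.args, **self.kwargs)


class Pipeline(object):
    """Run a document through a list of Functors, feeding each functor's
    return value into the next one."""
    def __init__(self, name=None, funcLine=None):
        self.name = name or 'Pipeline'
        self.funcLine = funcLine or []

    def getPipelineName(self):
        return self.name

    def run(self, obj):
        for functor in self.funcLine:
            obj = functor(obj)
        return obj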
Example no. 2
    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSUnmerged module
        :param msConfig: micro service configuration
        """
        super(MSUnmerged, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        self.msConfig.setdefault("limitFilesPerRSE", 200)
        self.msConfig.setdefault("skipRSEs", [])
        self.msConfig.setdefault("rseExpr", "*")
        self.msConfig.setdefault("enableRealMode", False)
        self.msConfig.setdefault("dumpRSE", False)
        self.msConfig.setdefault("gfalLogLevel", 'normal')
        self.msConfig.setdefault("dirFilterIncl", [])
        self.msConfig.setdefault("dirFilterExcl", [])
        self.msConfig.setdefault("emulateGfal2", False)
        self.msConfig.setdefault("filesToDeleteSliceSize", 100)
        if self.msConfig['emulateGfal2'] is False and gfal2 is None:
            msg = "Failed to import gfal2 library while it's not "
            msg += "set to emulate it. Crashing the service!"
            raise ImportError(msg)

        # TODO: Add 'alertManagerUrl' to msConfig
        # self.alertServiceName = "ms-unmerged"
        # self.alertManagerAPI = AlertManagerAPI(self.msConfig.get("alertManagerUrl", None), logger=logger)

        # Instantiating the Rucio Consistency Monitor Client
        self.rucioConMon = RucioConMon(self.msConfig['rucioConMon'], logger=self.logger)

        self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger)

        # Building all the Pipelines:
        pName = 'plineUnmerged'
        self.plineUnmerged = Pipeline(name=pName,
                                      funcLine=[Functor(self.updateRSETimestamps, start=True, end=False),
                                                Functor(self.consRecordAge),
                                                Functor(self.getUnmergedFiles),
                                                Functor(self.filterUnmergedFiles),
                                                Functor(self.getPfn),
                                                Functor(self.cleanRSE),
                                                Functor(self.updateRSECounters, pName),
                                                Functor(self.updateRSETimestamps, start=False, end=True),
                                                Functor(self.purgeRseObj, dumpRSE=self.msConfig['dumpRSE'])])
        # Initialization of the deleted files counters:
        self.rseCounters = {}
        self.plineCounters = {}
        self.rseTimestamps = {}
        self.rseConsStats = {}
        self.protectedLFNs = []

        # The basic /store/unmerged regular expressions:
        self.regStoreUnmergedLfn = re.compile("^/store/unmerged/.*$")
        self.regStoreUnmergedPfn = re.compile("^.+/store/unmerged/.*$")
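
To illustrate the difference between the two patterns above (with made-up paths): the LFN pattern is anchored at the very beginning of the string, while the PFN pattern requires at least one character of protocol/site prefix before /store/unmerged/.

import re

regStoreUnmergedLfn = re.compile("^/store/unmerged/.*$")
regStoreUnmergedPfn = re.compile("^.+/store/unmerged/.*$")

lfn = "/store/unmerged/Run2023/file.root"
pfn = "davs://eos.example.org:443/eos/cms" + lfn

assert regStoreUnmergedLfn.match(lfn)
assert not regStoreUnmergedLfn.match(pfn)   # prefix breaks the '^/store' anchor
assert regStoreUnmergedPfn.match(pfn)
assert not regStoreUnmergedPfn.match(lfn)   # '.+' demands a non-empty prefix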
Example no. 3
    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSUnmerged module
        :param msConfig: micro service configuration
        """
        super(MSUnmerged, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        # self.msConfig.setdefault('limitRSEsPerInstance', 100)
        # self.msConfig.setdefault('limitTiersPerInstance', ['T1', 'T2', 'T3'])
        # self.msConfig.setdefault("rucioAccount", "FIXME_RUCIO_ACCT")
        self.msConfig.setdefault("rseExpr", "*")
        # TODO: Add 'alertManagerUrl' to msConfig
        # self.alertServiceName = "ms-unmerged"
        # self.alertManagerAPI = AlertManagerAPI(self.msConfig.get("alertManagerUrl", None), logger=logger)

        # Building all the Pipelines:
        pName = 'plineUnmerged'
        self.plineUnmerged = Pipeline(name=pName,
                                      funcLine=[Functor(self.cleanFiles)])
        # Initialization of the deleted files counters:
        self.rseCounters = {}
        self.plineCounters = {}
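
The setdefault() calls above implement a simple configuration layering, sketched below with made-up values: keys supplied by the caller win, while missing keys fall back to the module defaults.

msConfig = {"interval": 300}          # caller overrides the polling interval
msConfig.setdefault("verbose", True)  # applied: the key is absent
msConfig.setdefault("interval", 60)   # no-op: the caller already set it
msConfig.setdefault("rseExpr", "*")
print(msConfig)
# {'interval': 300, 'verbose': True, 'rseExpr': '*'}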
Example no. 4
    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSRuleCleaner module
        :param msConfig: micro service configuration
        """
        super(MSRuleCleaner, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        self.msConfig.setdefault("services", ['ruleCleaner'])
        self.msConfig.setdefault("rucioWmaAccount", "wma_test")
        self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor")
        self.msConfig.setdefault('enableRealMode', False)
        self.mode = "RealMode" if self.msConfig[
            'enableRealMode'] else "DryRunMode"
        self.emailAlert = EmailAlert(self.msConfig)
        self.curlMgr = RequestHandler()

        # Building all the Pipelines:
        pName = 'plineMSTrCont'
        self.plineMSTrCont = Pipeline(name=pName,
                                      funcLine=[
                                          Functor(self.setPlineMarker, pName),
                                          Functor(self.cleanRucioRules)
                                      ])
        pName = 'plineMSTrBlock'
        self.plineMSTrBlock = Pipeline(name=pName,
                                       funcLine=[
                                           Functor(self.setPlineMarker, pName),
                                           Functor(self.cleanRucioRules)
                                       ])
        pName = 'plineAgentCont'
        self.plineAgentCont = Pipeline(
            name=pName,
            funcLine=[
                Functor(self.setPlineMarker, pName),
                Functor(self.getRucioRules, 'container',
                        self.msConfig['rucioWmaAccount']),
                Functor(self.cleanRucioRules)
            ])
        pName = 'plineAgentBlock'
        self.plineAgentBlock = Pipeline(
            name=pName,
            funcLine=[
                Functor(self.setPlineMarker, pName),
                Functor(self.getRucioRules, 'block',
                        self.msConfig['rucioWmaAccount']),
                Functor(self.cleanRucioRules)
            ])
        pName = 'plineArchive'
        self.plineArchive = Pipeline(name=pName,
                                     funcLine=[
                                         Functor(self.setPlineMarker, pName),
                                         Functor(self.setClean),
                                         Functor(self.archive)
                                     ])

        # Building the different sets of plines we will need later:
        # NOTE: The following are all the functional pipelines which are supposed to include
        #       a cleanup function and report cleanup status in the MSRuleCleanerWflow object
        self.cleanuplines = [
            self.plineMSTrCont, self.plineMSTrBlock, self.plineAgentCont,
            self.plineAgentBlock
        ]
        # Building an auxiliary list of cleanup pipeline names only:
        self.cleanupPipeNames = [pline.name for pline in self.cleanuplines]

        # Building lists of pipelines related only to Agents or MSTransferor
        self.agentlines = [self.plineAgentCont, self.plineAgentBlock]
        self.mstrlines = [self.plineMSTrCont, self.plineMSTrBlock]

        # Initialization of the 'cleaned' and 'archived' counters:
        self.wfCounters = {'cleaned': {}, 'archived': 0}
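
Note how the local variable pName is rebound for every pipeline above. That is safe because each marker is captured at Functor construction time, as this hypothetical standalone snippet demonstrates (it assumes the Functor sketch shown after Example no. 1):

def tag(doc, marker):
    """Append a pipeline marker to the document and pass it on."""
    doc.setdefault('markers', []).append(marker)
    return doc

pName = 'plineA'
fA = Functor(tag, pName)
pName = 'plineB'          # rebinding the local does not affect fA
fB = Functor(tag, pName)

doc = fA(fB({}))
print(doc['markers'])     # ['plineB', 'plineA']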
Example no. 5
class MSRuleCleaner(MSCore):
    """
    The MSRuleCleaner class provides the logic used to clean the Rucio
    block level data placement rules created by WMAgent.
    """
    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSRuleCleaner module
        :param msConfig: micro service configuration
        """
        super(MSRuleCleaner, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        self.msConfig.setdefault("services", ['ruleCleaner'])
        self.msConfig.setdefault("rucioWmaAccount", "wma_test")
        self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor")
        self.msConfig.setdefault('enableRealMode', False)
        self.mode = "RealMode" if self.msConfig[
            'enableRealMode'] else "DryRunMode"
        self.emailAlert = EmailAlert(self.msConfig)
        self.curlMgr = RequestHandler()

        # Building all the Pipelines:
        pName = 'plineMSTrCont'
        self.plineMSTrCont = Pipeline(name=pName,
                                      funcLine=[
                                          Functor(self.setPlineMarker, pName),
                                          Functor(self.cleanRucioRules)
                                      ])
        pName = 'plineMSTrBlock'
        self.plineMSTrBlock = Pipeline(name=pName,
                                       funcLine=[
                                           Functor(self.setPlineMarker, pName),
                                           Functor(self.cleanRucioRules)
                                       ])
        pName = 'plineAgentCont'
        self.plineAgentCont = Pipeline(
            name=pName,
            funcLine=[
                Functor(self.setPlineMarker, pName),
                Functor(self.getRucioRules, 'container',
                        self.msConfig['rucioWmaAccount']),
                Functor(self.cleanRucioRules)
            ])
        pName = 'plineAgentBlock'
        self.plineAgentBlock = Pipeline(
            name=pName,
            funcLine=[
                Functor(self.setPlineMarker, pName),
                Functor(self.getRucioRules, 'block',
                        self.msConfig['rucioWmaAccount']),
                Functor(self.cleanRucioRules)
            ])
        pName = 'plineArchive'
        self.plineArchive = Pipeline(name=pName,
                                     funcLine=[
                                         Functor(self.setPlineMarker, pName),
                                         Functor(self.setClean),
                                         Functor(self.archive)
                                     ])

        # Building the different sets of plines we will need later:
        # NOTE: The following are all the functional pipelines which are supposed to include
        #       a cleanup function and report cleanup status in the MSRuleCleanerWflow object
        self.cleanuplines = [
            self.plineMSTrCont, self.plineMSTrBlock, self.plineAgentCont,
            self.plineAgentBlock
        ]
        # Building an auxiliary list of cleanup pipeline names only:
        self.cleanupPipeNames = [pline.name for pline in self.cleanuplines]

        # Building lists of pipelines related only to Agents or MSTransferor
        self.agentlines = [self.plineAgentCont, self.plineAgentBlock]
        self.mstrlines = [self.plineMSTrCont, self.plineMSTrBlock]

        # Initialization of the 'cleaned' and 'archived' counters:
        self.wfCounters = {'cleaned': {}, 'archived': 0}

    def resetCounters(self):
        """
        A simple function for zeroing the cleaned and archived counters.
        """
        for pline in self.cleanuplines:
            self.wfCounters['cleaned'][pline.name] = 0
        self.wfCounters['archived'] = 0

    def execute(self, reqStatus):
        """
        Executes the whole ruleCleaner logic
        :return: summary
        """
        # this method is called from threads started by MSManager
        summary = dict(RULECLEANER_REPORT)

        self.currThread = current_thread()
        self.currThreadIdent = self.currThread.name
        self.updateReportDict(summary, "thread_id", self.currThreadIdent)
        self.resetCounters()

        self.logger.info("MSRuleCleaner is running in mode: %s.", self.mode)

        # Build the list of workflows to work on:
        try:
            requestRecords = {}
            for status in reqStatus:
                requestRecords.update(self.getRequestRecords(status))
        except Exception as err:  # general error
            msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s", str(
                err)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)

        # Call _execute() and feed the relevant pipeline with the objects popped from requestRecords
        try:
            totalNumRequests, cleanNumRequests, archivedNumRequests = self._execute(
                requestRecords)
            msg = "\nNumber of processed workflows: %s."
            msg += "\nNumber of properly cleaned workflows: %s."
            msg += "\nNumber of archived workflows: %s."
            self.logger.info(msg, totalNumRequests, cleanNumRequests,
                             archivedNumRequests)
            self.updateReportDict(summary, "total_num_requests",
                                  totalNumRequests)
            self.updateReportDict(summary, "clean_num_requests",
                                  cleanNumRequests)
            self.updateReportDict(summary, "archived_num_requests",
                                  archivedNumRequests)
        except Exception as ex:
            msg = "Unknown exception while running MSRuleCleaner thread Error: %s"
            self.logger.exception(msg, str(ex))
            self.updateReportDict(summary, "error", msg)

        return summary

    def _execute(self, reqRecords):
        """
        Executes the MSRuleCleaner pipelines based on the workflow status
        :param reqRecords: A dictionary of request records to work on
        :return:           a tuple with:
                               number of processed workflows
                               number of properly cleaned requests
                               number of archived workflows
        """
        # NOTE: The Input Cleanup, the Block Level Cleanup and the Archival
        #       Pipelines are executed sequentially in the above order.
        #       This way we assure ourselves that we archive only workflows
        #       that have accomplished the needed cleanup

        cleanNumRequests = 0
        totalNumRequests = 0

        # Call the workflow dispatcher:
        for _, req in reqRecords.items():
            wflow = MSRuleCleanerWflow(req)
            self._dispatchWflow(wflow)
            msg = "\n----------------------------------------------------------"
            msg += "\nMSRuleCleanerWflow: %s"
            msg += "\n----------------------------------------------------------"
            self.logger.debug(msg, pformat(wflow))
            totalNumRequests += 1
            if self._checkClean(wflow):
                cleanNumRequests += 1

        # Report the counters:
        for pline in self.cleanuplines:
            msg = "Workflows cleaned by pipeline: %s: %d"
            self.logger.info(msg, pline.name,
                             self.wfCounters['cleaned'][pline.name])
        archivedNumRequests = self.wfCounters['archived']
        self.logger.info("Workflows archived: %d", self.wfCounters['archived'])
        return totalNumRequests, cleanNumRequests, archivedNumRequests

    def _dispatchWflow(self, wflow):
        """
        A function intended to dispatch a workflow (e.g. based on its status)
        through one or more functional pipelines, whenever the order in which we
        execute them involves more complicated logic than simply running them
        sequentially
        """
        self.logger.debug("Dispatching workflow: %s", wflow['RequestName'])
        # NOTE: The following dispatch logic is subject to change at any time

        # Resolve:
        # NOTE: First resolve any preliminary flags that will be needed further
        #       in the logic of the _dispatcher() itself
        if wflow['RequestStatus'] == 'announced':
            self.getMSOutputTransferInfo(wflow)

        # Clean:
        # Do not clean any Resubmission, but still let them be archived
        if wflow['RequestType'] == 'Resubmission':
            wflow['ForceArchive'] = True
            msg = "Skipping cleanup step for workflow: %s - RequestType is %s."
            msg += " Will try to archive it directly."
            self.logger.info(msg, wflow['RequestName'], wflow['RequestType'])
        elif wflow['RequestStatus'] in ['rejected', 'aborted-completed']:
            # NOTE: We do not check the ParentageResolved flag for these
            #       workflows, but we do need to clean output data placement
            #       rules from the agents for them
            for pline in self.agentlines:
                try:
                    pline.run(wflow)
                except Exception as ex:
                    msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. "
                    msg += "\nWill retry again in the next cycle."
                    self.logger.exception(msg, pline.name,
                                          wflow['RequestName'], str(ex))
                    continue
                if wflow['CleanupStatus'][pline.name]:
                    self.wfCounters['cleaned'][pline.name] += 1
        elif wflow['RequestStatus'] == 'announced' and not wflow['ParentageResolved']:
            # NOTE: We skip workflows which do not yet have the 'ParentageResolved'
            #       flag set, but we still need some proper logging for them.
            msg = "Skipping workflow: %s - 'ParentageResolved' flag set to false."
            msg += " Will retry again in the next cycle."
            self.logger.info(msg, wflow['RequestName'])
        elif wflow['RequestStatus'] == 'announced' and not wflow['TransferDone']:
            # NOTE: We skip workflows which have not yet finalised their TransferStatus
            #       in MSOutput, but we still need some proper logging for them.
            msg = "Skipping workflow: %s - 'TransferStatus' is 'pending' or 'TransferInfo' is missing in MSOutput."
            msg += " Will retry again in the next cycle."
            self.logger.info(msg, wflow['RequestName'])
        elif wflow['RequestStatus'] == 'announced':
            for pline in self.cleanuplines:
                try:
                    pline.run(wflow)
                except Exception as ex:
                    msg = "%s: General error from pipeline. Workflow: %s. Error:  \n%s. "
                    msg += "\nWill retry again in the next cycle."
                    self.logger.exception(msg, pline.name,
                                          wflow['RequestName'], str(ex))
                    continue
                if wflow['CleanupStatus'][pline.name]:
                    self.wfCounters['cleaned'][pline.name] += 1
        else:
            # We shouldn't be here:
            msg = "Skipping workflow: %s - "
            msg += "Does not fall under any of the defined categories."
            self.logger.error(msg, wflow['RequestName'])

        # Archive:
        try:
            self.plineArchive.run(wflow)
            self.wfCounters['archived'] += 1
        except MSRuleCleanerArchival as ex:
            msg = "%s: Archival Error: %s. "
            msg += " Will retry again in the next cycle."
            self.logger.error(msg, wflow['PlineMarkers'][-1], str(ex))
        except Exception as ex:
            msg = "%s General error from pipeline. Workflow: %s. Error: \n%s. "
            msg += "\nWill retry again in the next cycle."
            self.logger.exception(msg, wflow['PlineMarkers'][-1],
                                  wflow['RequestName'], str(ex))

    def setPlineMarker(self, wflow, pName):
        """
        A function intended to mark which pipeline is currently working on the
        workflow. It is always supposed to be called as the first function in
        the pipeline.
        :param  wflow:   A MSRuleCleaner workflow representation
        :param  pName:   The name of the functional pipeline
        :return wflow:
        """
        # NOTE: The current functional pipeline MUST always be appended at the
        #       end of the 'PlineMarkers' list

        # First get rid of the default:
        if not wflow['PlineMarkers']:
            wflow['PlineMarkers'] = []

        # Then push our current value into the markers list:
        wflow['PlineMarkers'].append(pName)

        # Populate the list of flags to be used later:
        if pName not in wflow['RulesToClean']:
            if pName in self.cleanupPipeNames:
                wflow['RulesToClean'][pName] = []
        if pName not in wflow['CleanupStatus']:
            if pName in self.cleanupPipeNames:
                wflow['CleanupStatus'][pName] = False
        return wflow

    def _checkClean(self, wflow):
        """
        An auxiliary function used to only check the temporary cleanup status.
        It basically takes the pipelines registered in 'PlineMarkers' that have
        already worked on the workflow as a mask and applies this mask over
        the set of flags in the 'CleanupStatus' field and then reduces the
        result to a single bool value
        """
        # NOTE: This is one of the few functions taking a workflow as an argument
        #       but returning a bool, since it is an auxiliary function and is not
        #       supposed to be called as a standalone function in a pipeline.
        # NOTE: `all([]) == True`, ergo all the 'rejected' && 'aborted-completed' workflows
        #       are also counted as properly cleaned and can trigger archival later

        # Build a list of bool flags based on the mask of PlineMarkers
        cleanFlagsList = [
            wflow['CleanupStatus'][key] for key in wflow['PlineMarkers']
            if key in wflow['CleanupStatus'].keys()
        ]

        # If no pipeline has worked on the workflow, set the clean status to False
        if not wflow['PlineMarkers']:
            cleanStatus = False
        # If we have a mask longer than the list of flags avoid false positives
        # because of the behavior explained above - `all([]) == True`
        elif not cleanFlagsList:
            cleanStatus = False
        # Figure out the final value
        else:
            cleanStatus = all(cleanFlagsList)
        return cleanStatus
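
    # Worked example of the masking behaviour above (made-up values):
    #   PlineMarkers == ['plineArchive'] gives cleanFlagsList == [] (no cleanup
    #   pipeline ever ran), so cleanStatus is forced to False even though
    #   all([]) == True;
    #   PlineMarkers == ['plineAgentCont', 'plineAgentBlock', 'plineArchive'] with
    #   CleanupStatus == {'plineAgentCont': True, 'plineAgentBlock': True} gives
    #   cleanFlagsList == [True, True], hence cleanStatus == True.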

    def setClean(self, wflow):
        """
        A function to set the 'IsClean' flag based on the status from all the
        pipelines which have worked on the workflow (and have put their markers
        in the 'PlineMarkers' list)
        :param  wflow:      A MSRuleCleaner workflow representation
        :return wflow:
        """
        wflow['IsClean'] = self._checkClean(wflow)
        return wflow

    def archive(self, wflow):
        """
        Move the workflow to the proper archived status after checking
        the full cleanup status
        :param  wflow:      A MSRuleCleaner workflow representation
        :return wflow:
        """
        # NOTE: check allowed status transitions with:
        #       https://github.com/dmwm/WMCore/blob/5961d2229b1e548e58259c06af154f33bce36c68/src/python/WMCore/ReqMgr/DataStructs/RequestStatus.py#L171
        if not (wflow['IsClean'] or wflow['ForceArchive']):
            msg = "Not properly cleaned workflow: %s" % wflow['RequestName']
            raise MSRuleCleanerArchival(msg)

        # Check the available status transitions before we decide the final status
        targetStatusList = RequestStatus.REQUEST_STATE_TRANSITION.get(
            wflow['RequestStatus'], [])
        self.logger.info("targetStatusList: %s", targetStatusList)
        return wflow

    def getMSOutputTransferInfo(self, wflow):
        """
        Fetches the transfer information from the MSOutput REST interface for
        the given workflow.
        :param  wflow:   A MSRuleCleaner workflow representation
        :return wflow:
        """
        headers = {'Accept': 'application/json'}
        params = {}
        transferInfo = None
        url = '%s/data/info?request=%s' % (self.msConfig['msOutputUrl'],
                                           wflow['RequestName'])
        try:
            res = self.curlMgr.getdata(url,
                                       params=params,
                                       headers=headers,
                                       ckey=ckey(),
                                       cert=cert())
            data = json.loads(res)['result'][0]
            transferInfo = data['transferDoc']
        except Exception as ex:
            msg = "General exception while fetching TransferInfo from MSOutput for %s. "
            msg += "Error: %s"
            self.logger.exception(msg, wflow['RequestName'], str(ex))
        if transferInfo is not None and transferInfo['TransferStatus'] == 'done':
            wflow['TransferDone'] = True
        return wflow

    def getRucioRules(self, wflow, gran, rucioAcct):
        """
        Queries Rucio and builds the relevant list of container- or block-level
        rules for the given workflow
        :param  wflow:     A MSRuleCleaner workflow representation
        :param  gran:      Data granularity to search for Rucio rules. Possible
                           values: 'block' || 'container'
        :param  rucioAcct: The Rucio account whose rules are to be collected
        :return:           wflow
        """
        currPline = wflow['PlineMarkers'][-1]
        # Find all the output placement rules created by the agents
        for dataCont in wflow['OutputDatasets']:
            if gran == 'container':
                for rule in self.rucio.listDataRules(dataCont,
                                                     account=rucioAcct):
                    wflow['RulesToClean'][currPline].append(rule['id'])
            elif gran == 'block':
                try:
                    blocks = self.rucio.getBlocksInContainer(dataCont)
                    for block in blocks:
                        for rule in self.rucio.listDataRules(
                                block, account=rucioAcct):
                            wflow['RulesToClean'][currPline].append(rule['id'])
                except WMRucioDIDNotFoundException:
                    msg = "Container: %s not found in Rucio for workflow: %s."
                    self.logger.info(msg, dataCont, wflow['RequestName'])
        return wflow

    def cleanRucioRules(self, wflow):
        """
        Cleans all the Rules present in the field 'RulesToClean' in the MSRuleCleaner
        workflow representation. And fills the relevant Cleanup Status.
        :param wflow:   A MSRuleCleaner workflow representation
        :return:        wflow
        """
        # NOTE: The function should be called independently and sequentially from
        #       the Input and the respective BlockLevel pipelines.

        # NOTE: The current functional pipeline is always the last one in the PlineMarkers list
        currPline = wflow['PlineMarkers'][-1]
        delResults = []
        if self.msConfig['enableRealMode']:
            for rule in wflow['RulesToClean'][currPline]:
                self.logger.info("%s: Deleting ruleId: %s ", currPline, rule)
                delResult = self.rucio.deleteRule(rule)
                delResults.append(delResult)
                if not delResult:
                    self.logger.warning("%s: Failed to delete ruleId: %s ",
                                        currPline, rule)
        else:
            for rule in wflow['RulesToClean'][currPline]:
                delResults.append(True)
                self.logger.info("%s: DRY-RUN: Is about to delete ruleId: %s ",
                                 currPline, rule)

        # Set the cleanup flag:
        wflow['CleanupStatus'][currPline] = all(delResults)
        # ----------------------------------------------------------------------
        # FIXME : To be removed once the plineMSTrBlock && plineMSTrCont are
        #         developed
        if currPline in ['plineMSTrBlock', 'plineMSTrCont']:
            wflow['CleanupStatus'][currPline] = True
        # ----------------------------------------------------------------------
        return wflow

    def getRequestRecords(self, reqStatus):
        """
        Queries ReqMgr2 for requests in a given status.
        :param reqStatus: The status for the requests to be fetched from ReqMgr2
        :return requests: A dictionary with all the workflows in the given status
        """
        self.logger.info("Fetching requests in status: %s", reqStatus)
        result = self.reqmgr2.getRequestByStatus([reqStatus], detail=True)
        if not result:
            requests = {}
        else:
            requests = result[0]
        self.logger.info('  retrieved %s requests in status: %s',
                         len(requests), reqStatus)
        return requests
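
A minimal driver sketch for the class above, with hypothetical wiring and configuration (the real service is driven by MSManager threads and needs further msConfig keys, e.g. 'msOutputUrl' for getMSOutputTransferInfo):

if __name__ == '__main__':
    msConfig = {'enableRealMode': False,   # dry-run: no Rucio rules are deleted
                'msOutputUrl': 'https://cmsweb.example/ms-output'}
    cleaner = MSRuleCleaner(msConfig)
    summary = cleaner.execute(['announced', 'rejected', 'aborted-completed'])
    print(summary)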
Example no. 6
class MSRuleCleaner(MSCore):
    """
    The MSRuleCleaner class provides the logic used to clean the Rucio
    block level data placement rules created by WMAgent.
    """

    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSRuleCleaner module
        :param msConfig: micro service configuration
        """
        super(MSRuleCleaner, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        self.msConfig.setdefault("services", ['ruleCleaner'])
        self.msConfig.setdefault("rucioWmaAccount", "wma_test")
        self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor")
        self.msConfig.setdefault('enableRealMode', False)

        self.mode = "RealMode" if self.msConfig['enableRealMode'] else "DryRunMode"
        self.curlMgr = RequestHandler()
        self.targetStatusRegex = re.compile(r'.*archived')
        self.logDB = LogDB(self.msConfig["logDBUrl"],
                           self.msConfig["logDBReporter"],
                           logger=self.logger)
        self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger)

        # Building all the Pipelines:
        pName = 'plineMSTrCont'
        self.plineMSTrCont = Pipeline(name=pName,
                                      funcLine=[Functor(self.setPlineMarker, pName),
                                                Functor(self.setParentDatasets),
                                                Functor(self.getRucioRules, 'container', self.msConfig['rucioMStrAccount']),
                                                Functor(self.cleanRucioRules)])
        pName = 'plineMSTrBlock'
        self.plineMSTrBlock = Pipeline(name=pName,
                                       funcLine=[Functor(self.setPlineMarker, pName),
                                                 Functor(self.setParentDatasets),
                                                 Functor(self.getRucioRules, 'block', self.msConfig['rucioMStrAccount']),
                                                 Functor(self.cleanRucioRules)])
        pName = 'plineAgentCont'
        self.plineAgentCont = Pipeline(name=pName,
                                       funcLine=[Functor(self.setPlineMarker, pName),
                                                 Functor(self.getRucioRules, 'container', self.msConfig['rucioWmaAccount']),
                                                 Functor(self.cleanRucioRules)])
        pName = 'plineAgentBlock'
        self.plineAgentBlock = Pipeline(name=pName,
                                        funcLine=[Functor(self.setPlineMarker, pName),
                                                  Functor(self.getRucioRules, 'block', self.msConfig['rucioWmaAccount']),
                                                  Functor(self.cleanRucioRules)])
        pName = 'plineArchive'
        self.plineArchive = Pipeline(name=pName,
                                     funcLine=[Functor(self.setPlineMarker, pName),
                                               Functor(self.findTargetStatus),
                                               Functor(self.setClean),
                                               Functor(self.setArchivalDelayExpired),
                                               Functor(self.setLogDBClean),
                                               Functor(self.archive)])

        # Building the different sets of plines we will need later:
        # NOTE: The following are all the functional pipelines which are supposed to include
        #       a cleanup function and report cleanup status in the MSRuleCleanerWflow object
        self.cleanuplines = [self.plineMSTrCont,
                             self.plineMSTrBlock,
                             self.plineAgentCont,
                             self.plineAgentBlock]
        # Building an auxiliary list of cleanup pipeline names only:
        self.cleanupPipeNames = [pline.name for pline in self.cleanuplines]

        # Building lists of pipelines related only to Agents or MSTransferor
        self.agentlines = [self.plineAgentCont,
                           self.plineAgentBlock]
        self.mstrlines = [self.plineMSTrCont,
                          self.plineMSTrBlock]

        # Initialization of the 'cleaned' and 'archived' counters:
        self.wfCounters = {'cleaned': {},
                           'archived': {'normalArchived': 0,
                                        'forceArchived': 0}}
        self.globalLocks = set()

    def getGlobalLocks(self):
        """
        Fetches the list of 'globalLocks' from wmstats server and the list of
        'parentLocks' from request manager. Stores/updates the unified set in
        the 'globalLocks' instance variable.
        """
        self.logger.info("Fetching globalLocks list from wmstats server.")
        try:
            globalLocks = set(self.wmstatsSvc.getGlobalLocks())
        except Exception as ex:
            msg = "Failed to refresh global locks list for the current polling cycle. Error: %s "
            msg += "Skipping this polling cycle."
            self.logger.error(msg, str(ex))
            raise ex
        self.logger.info("Fetching parentLocks list from reqmgr2 server.")
        try:
            parentLocks = set(self.reqmgr2.getParentLocks())
        except Exception as ex:
            msg = "Failed to refresh parent locks list for the current poling cycle. Error: %s "
            msg += "Skipping this polling cycle."
            self.logger.error(msg, str(ex))
            raise ex
        self.globalLocks = globalLocks | parentLocks

    def resetCounters(self):
        """
        A simple function for zeroing the cleaned and archived counters.
        """
        for pline in self.cleanuplines:
            self.wfCounters['cleaned'][pline.name] = 0
        self.wfCounters['archived']['normalArchived'] = 0
        self.wfCounters['archived']['forceArchived'] = 0

    def execute(self, reqStatus):
        """
        Executes the whole ruleCleaner logic
        :return: summary
        """
        # this method is called from threads started by MSManager
        summary = dict(RULECLEANER_REPORT)

        self.currThread = current_thread()
        self.currThreadIdent = self.currThread.name
        self.updateReportDict(summary, "thread_id", self.currThreadIdent)
        self.resetCounters()
        self.logger.info("MSRuleCleaner is running in mode: %s.", self.mode)

        # Build the list of workflows to work on:
        try:
            requestRecords = {}
            for status in reqStatus:
                requestRecords.update(self.getRequestRecords(status))
        except Exception as err:  # general error
            msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s", str(err)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)

        # Call _execute() and feed the relevant pipeline with the objects popped from requestRecords
        try:
            self.getGlobalLocks()
            totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests = self._execute(requestRecords)
            msg = "\nNumber of processed workflows: %s."
            msg += "\nNumber of properly cleaned workflows: %s."
            msg += "\nNumber of normally archived workflows: %s."
            msg += "\nNumber of force archived workflows: %s."
            self.logger.info(msg,
                             totalNumRequests,
                             cleanNumRequests,
                             normalArchivedNumRequests,
                             forceArchivedNumRequests)
            self.updateReportDict(summary, "total_num_requests", totalNumRequests)
            self.updateReportDict(summary, "clean_num_requests", cleanNumRequests)
            self.updateReportDict(summary, "normal_archived_num_requests", normalArchivedNumRequests)
            self.updateReportDict(summary, "force_archived_num_requests", forceArchivedNumRequests)
        except Exception as ex:
            msg = "Unknown exception while running MSRuleCleaner thread Error: %s"
            self.logger.exception(msg, str(ex))
            self.updateReportDict(summary, "error", msg)

        return summary

    def _execute(self, reqRecords):
        """
        Executes the MSRuleCleaner pipelines based on the workflow status
        :param reqRecords: A dictionary of request records to work on
        :return:           a tuple with:
                               number of processed workflows
                               number of properly cleaned requests
                               number of normally archived workflows
                               number of force archived workflows
        """
        # NOTE: The Input Cleanup, the Block Level Cleanup and the Archival
        #       Pipelines are executed sequentially in the above order.
        #       This way we assure ourselves that we archive only workflows
        #       that have accomplished the needed cleanup

        cleanNumRequests = 0
        totalNumRequests = 0

        # Call the workflow dispatcher:
        for req in viewvalues(reqRecords):
            wflow = MSRuleCleanerWflow(req)
            self._dispatchWflow(wflow)
            msg = "\n----------------------------------------------------------"
            msg += "\nMSRuleCleanerWflow: %s"
            msg += "\n----------------------------------------------------------"
            self.logger.debug(msg, pformat(wflow))
            totalNumRequests += 1
            if self._checkClean(wflow):
                cleanNumRequests += 1

        # Report the counters:
        for pline in self.cleanuplines:
            msg = "Workflows cleaned by pipeline: %s: %d"
            self.logger.info(msg, pline.name, self.wfCounters['cleaned'][pline.name])
        normalArchivedNumRequests = self.wfCounters['archived']['normalArchived']
        forceArchivedNumRequests = self.wfCounters['archived']['forceArchived']
        self.logger.info("Workflows normally archived: %d", self.wfCounters['archived']['normalArchived'])
        self.logger.info("Workflows force archived: %d", self.wfCounters['archived']['forceArchived'])
        return totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests

    def _dispatchWflow(self, wflow):
        """
        A function intended to dispatch a workflow (e.g. based on its status)
        through one or more functional pipelines, whenever the order in which we
        execute them involves more complicated logic than simply running them
        sequentially
        """
        self.logger.debug("Dispatching workflow: %s", wflow['RequestName'])
        # NOTE: The following dispatch logic is subject to change at any time

        # Resolve:
        # NOTE: First resolve any preliminary flags that will be needed further
        #       in the logic of the _dispatcher() itself
        if wflow['RequestStatus'] == 'announced':
            self.getMSOutputTransferInfo(wflow)

        # Clean:
        # Do not clean any Resubmission, but still let them be archived
        if wflow['RequestType'] == 'Resubmission':
            wflow['ForceArchive'] = True
            msg = "Skipping cleanup step for workflow: %s - RequestType is %s."
            msg += " Will try to archive it directly."
            self.logger.info(msg, wflow['RequestName'], wflow['RequestType'])
        elif wflow['RequestStatus'] in ['rejected', 'aborted-completed']:
            # NOTE: We do not check the ParentageResolved flag for these
            #       workflows, but we do need to clean output data placement
            #       rules from the agents for them
            for pline in self.agentlines:
                try:
                    pline.run(wflow)
                except Exception as ex:
                    msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. "
                    msg += "\nWill retry again in the next cycle."
                    self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex))
                    continue
                if wflow['CleanupStatus'][pline.name]:
                    self.wfCounters['cleaned'][pline.name] += 1
        elif wflow['RequestStatus'] == 'announced' and not wflow['ParentageResolved']:
            # NOTE: We skip workflows which do not yet have the 'ParentageResolved'
            #       flag set, but we still need some proper logging for them.
            msg = "Skipping workflow: %s - 'ParentageResolved' flag set to false."
            msg += " Will retry again in the next cycle."
            self.logger.info(msg, wflow['RequestName'])
        elif wflow['RequestStatus'] == 'announced' and not wflow['TransferDone']:
            # NOTE: We skip workflows which have not yet finalised their TransferStatus
            #       in MSOutput, but we still need some proper logging for them.
            msg = "Skipping workflow: %s - 'TransferStatus' is 'pending' or 'TransferInfo' is missing in MSOutput."
            msg += " Will retry again in the next cycle."
            self.logger.info(msg, wflow['RequestName'])
        elif wflow['RequestStatus'] == 'announced' and not wflow['TransferTape']:
            # NOTE: We skip workflows which have not yet finalised their tape transfers.
            #       (i.e. even if a single output which is supposed to be covered
            #       by a tape rule is in any of the following transient states:
            #       {REPLICATING, STUCK, SUSPENDED, WAITING_APPROVAL}.)
            #       We still need some proper logging for them.
            msg = "Skipping workflow: %s - tape transfers are not yet completed."
            msg += " Will retry again in the next cycle."
            self.logger.info(msg, wflow['RequestName'])
        elif wflow['RequestStatus'] == 'announced':
            for pline in self.cleanuplines:
                try:
                    pline.run(wflow)
                except MSRuleCleanerResolveParentError as ex:
                    msg = "%s: Parentage Resolve Error: %s. "
                    msg += "Will retry again in the next cycle."
                    self.logger.error(msg, pline.name, str(ex))
                    continue
                except Exception as ex:
                    msg = "%s: General error from pipeline. Workflow: %s. Error:  \n%s. "
                    msg += "\nWill retry again in the next cycle."
                    self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex))
                    continue
                if wflow['CleanupStatus'][pline.name]:
                    self.wfCounters['cleaned'][pline.name] += 1
        else:
            # We shouldn't be here:
            msg = "Skipping workflow: %s - "
            msg += "Does not fall under any of the defined categories."
            self.logger.error(msg, wflow['RequestName'])

        # Archive:
        try:
            self.plineArchive.run(wflow)
            if wflow['ForceArchive']:
                self.wfCounters['archived']['forceArchived'] += 1
            else:
                self.wfCounters['archived']['normalArchived'] += 1
        except MSRuleCleanerArchivalSkip as ex:
            msg = "%s: Proper conditions not met: %s. "
            msg += "Skipping archival in the current cycle."
            self.logger.info(msg, wflow['PlineMarkers'][-1], str(ex))
        except MSRuleCleanerArchivalError as ex:
            msg = "%s: Archival Error: %s. "
            msg += "Will retry again in the next cycle."
            self.logger.error(msg, wflow['PlineMarkers'][-1], str(ex))
        except Exception as ex:
            msg = "%s General error from pipeline. Workflow: %s. Error: \n%s. "
            msg += "\nWill retry again in the next cycle."
            self.logger.exception(msg, wflow['PlineMarkers'][-1], wflow['RequestName'], str(ex))

    def setPlineMarker(self, wflow, pName):
        """
        A function intended to mark which pipeline is currently working on the
        workflow. It is always supposed to be called as the first function in
        the pipeline.
        :param  wflow:   A MSRuleCleaner workflow representation
        :param  pName:   The name of the functional pipeline
        :return:         The workflow object
        """
        # NOTE: The current functional pipeline MUST always be appended at the
        #       end of the 'PlineMarkers' list

        # First get rid of the default:
        if not wflow['PlineMarkers']:
            wflow['PlineMarkers'] = []

        # Then push our current value into the markers list:
        wflow['PlineMarkers'].append(pName)

        # Populate the list of flags to be used later:
        if pName not in wflow['RulesToClean']:
            if pName in self.cleanupPipeNames:
                wflow['RulesToClean'][pName] = []
        if pName not in wflow['CleanupStatus']:
            if pName in self.cleanupPipeNames:
                wflow['CleanupStatus'][pName] = False
        return wflow

    def _checkClean(self, wflow):
        """
        An auxiliary function used to only check the temporary cleanup status.
        It basically takes the pipelines registered in 'PlineMarkers' that have
        already worked on the workflow as a mask and applies this mask over
        the set of flags in the 'CleanupStatus' field and then reduces the
        result to a single bool value
        """
        # NOTE: This is one of the few functions taking a workflow as an argument
        #       but returning a bool, since it is an auxiliary function and is not
        #       supposed to be called as a standalone function in a pipeline.
        # NOTE: `all([]) == True`, ergo all the 'rejected' && 'aborted-completed' workflows
        #       are also counted as properly cleaned and can trigger archival later

        # Build a list of bool flags based on the mask of PlineMarkers
        cleanFlagsList = [wflow['CleanupStatus'][key]
                          for key in wflow['PlineMarkers']
                          if key in wflow['CleanupStatus']]

        # If no pipeline has worked on the workflow, set the clean status to False
        if not wflow['PlineMarkers']:
            cleanStatus = False
        # If we have a mask longer than the list of flags avoid false positives
        # because of the behavior explained above - `all([]) == True`
        elif not cleanFlagsList:
            cleanStatus = False
        # Figure out the final value
        else:
            cleanStatus = all(cleanFlagsList)
        return cleanStatus

    def setClean(self, wflow):
        """
        A function to set the 'IsClean' flag based on the status from all the
        pipelines which have worked on the workflow (and have put their markers
        in the 'PlineMarkers' list)
        :param  wflow:      A MSRuleCleaner workflow representation
        :return:            The workflow object
        """
        wflow['IsClean'] = self._checkClean(wflow)
        return wflow

    def _checkLogDBClean(self, wflow):
        """
        An auxiliary function used to only check the LogDB cleanup status.
        It makes a query to LogDB in order to verify that there are no records for
        the current workflow
        :param wflow:       A MSRuleCleaner workflow representation
        :return:            True if no records were found in LogDB about wflow
        """
        cleanStatus = False
        logDBRecords = self.logDB.get(wflow['RequestName'])
        self.logger.debug("logDBRecords: %s", pformat(logDBRecords))
        if not logDBRecords:
            cleanStatus = True
        return cleanStatus

    def setLogDBClean(self, wflow):
        """
        A function to set the 'IsLogDBClean' flag based on the presence of any
        records in LogDB for the current workflow.
        :param  wflow:      A MSRuleCleaner workflow representation
        :return:            The workflow object
        """
        wflow['IsLogDBClean'] = self._checkLogDBClean(wflow)
        if not wflow['IsLogDBClean'] and wflow['IsArchivalDelayExpired']:
            wflow['IsLogDBClean'] = self._cleanLogDB(wflow)
        return wflow

    def _cleanLogDB(self, wflow):
        """
        A function to be used for cleaning all the records related to a workflow in logDB.
        :param wflow:       A MSRuleCleaner workflow representation
        :return:            True if NO errors were encountered while deleting
                            records from LogDB
        """
        cleanStatus = False
        try:
            if self.msConfig['enableRealMode']:
                self.logger.info("Deleting %s records from LogDB WMStats...", wflow['RequestName'])
                res = self.logDB.delete(wflow['RequestName'], agent=False)
                if res == 'delete-error':
                    msg = "Failed to delete logDB docs for wflow: %s" % wflow['RequestName']
                    raise MSRuleCleanerArchivalError(msg)
                cleanStatus = True
            else:
                self.logger.info("DRY-RUN: NOT Deleting %s records from LogDB WMStats...", wflow['RequestName'])
        except Exception as ex:
            msg = "General Exception while cleaning LogDB records for wflow: %s : %s"
            self.logger.exception(msg, wflow['RequestName'], str(ex))
        return cleanStatus

    def findTargetStatus(self, wflow):
        """
        Find the proper targeted archival status
        :param  wflow:      A MSRuleCleaner workflow representation
        :return:            The workflow object
        """
        # Check the available status transitions before we decide the final status
        targetStatusList = RequestStatus.REQUEST_STATE_TRANSITION.get(wflow['RequestStatus'], [])
        for status in targetStatusList:
            if self.targetStatusRegex.match(status):
                wflow['TargetStatus'] = status
        self.logger.debug("TargetStatus: %s", wflow['TargetStatus'])
        return wflow
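
    # For illustration (hypothetical transition map): a workflow in status
    # 'announced' whose allowed next states are ['rejected', 'normal-archived']
    # would end up with TargetStatus == 'normal-archived', since only that entry
    # matches the compiled '.*archived' pattern.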

    def _checkArchDelayExpired(self, wflow):
        """
        A function to check Archival Expiration Delay based on the information
        returned by WMStatsServer regarding the time of the last request status transition
        :param wflow:      MSRuleCleaner workflow representation
        :return:           True if the archival delay has expired
        """
        archDelayExpired = False
        currentTime = int(time.time())
        threshold = self.msConfig['archiveDelayHours'] * 3600
        try:
            lastTransitionTime = wflow['RequestTransition'][-1]['UpdateTime']
            if lastTransitionTime and (currentTime - lastTransitionTime) > threshold:
                archDelayExpired = True
        except KeyError:
            self.logger.debug("Could not find status transition history for %s", wflow['RequestName'])
        return archDelayExpired
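
    # Worked example (hypothetical config): with archiveDelayHours == 8 the
    # threshold is 8 * 3600 == 28800 seconds, so the delay counts as expired
    # only once the last status transition is more than eight hours old.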

    def setArchivalDelayExpired(self, wflow):
        """
        A function to set the 'IsArchivalDelayExpired' flag
        """
        wflow['IsArchivalDelayExpired'] = self._checkArchDelayExpired(wflow)
        return wflow

    def archive(self, wflow):
        """
        Move the workflow to the proper archived status after checking
        the full cleanup status
        :param  wflow:      A MSRuleCleaner workflow representation
        :return:            The workflow object
        """
        # Make all the needed checks before trying to archive
        if not (wflow['IsClean'] or wflow['ForceArchive']):
            msg = "Not properly cleaned workflow: %s" % wflow['RequestName']
            raise MSRuleCleanerArchivalSkip(msg)
        if not wflow['TargetStatus']:
            msg = "Could not determine which archival status to target for workflow: %s" % wflow['RequestName']
            raise MSRuleCleanerArchivalError(msg)
        if not wflow['IsLogDBClean']:
            msg = "LogDB records have not been cleaned for workflow: %s" % wflow['RequestName']
            raise MSRuleCleanerArchivalSkip(msg)
        if not wflow['IsArchivalDelayExpired']:
            msg = "Archival delay period has not yet expired for workflow: %s." % wflow['RequestName']
            raise MSRuleCleanerArchivalSkip(msg)
        if not self.msConfig['enableRealMode']:
            msg = "Real Run Mode not enabled."
            raise MSRuleCleanerArchivalSkip(msg)

        # Proceed with the actual archival:
        try:
            self.reqmgr2.updateRequestStatus(wflow['RequestName'], wflow['TargetStatus'])
            msg = "Successful status transition to: %s for workflow: %s"
            self.logger.info(msg, wflow['TargetStatus'], wflow['RequestName'])
        except Exception as ex:
            msg = "General Exception while trying status transition to: %s " % wflow['TargetStatus']
            msg += "for workflow: %s : %s" % (wflow['RequestName'], str(ex))
            raise MSRuleCleanerArchivalError(msg)
        return wflow

    def getMSOutputTransferInfo(self, wflow):
        """
        Fetches the transfer information from the MSOutput REST interface for
        the given workflow.
        :param  wflow:   A MSRuleCleaner workflow representation
        :return:         The workflow object
        """
        headers = {'Accept': 'application/json'}
        params = {}
        url = '%s/data/info?request=%s' % (self.msConfig['msOutputUrl'],
                                           wflow['RequestName'])
        # Initialize to None so a failed fetch cannot raise a NameError in the
        # checks below:
        transferInfo = None
        try:
            res = self.curlMgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
            data = json.loads(res)['result'][0]
            transferInfo = data['transferDoc']
        except Exception as ex:
            msg = "General exception while fetching TransferInfo from MSOutput for %s. "
            msg += "Error: %s"
            self.logger.exception(msg, wflow['RequestName'], str(ex))

        # Set Transfer status - information fetched from MSOutput only
        if transferInfo is not None and transferInfo['TransferStatus'] == 'done':
            wflow['TransferDone'] = True

        # Set Tape rules status - information fetched from Rucio (tape rule ids from MSOutput)
        if transferInfo is not None and transferInfo['OutputMap']:
            tapeRulesStatusList = []
            # 'TransferTape' is set to True only if either no tape rules were
            # created for the workflow or all existing tape rules are in status
            # 'OK'; an empty TapeRuleID is therefore treated as completed.
            for mapRecord in transferInfo['OutputMap']:
                if not mapRecord['TapeRuleID']:
                    continue
                rucioRule = self.rucio.getRule(mapRecord['TapeRuleID'])
                if not rucioRule:
                    tapeRulesStatusList.append(False)
                    msg = "Tape rule: %s not found for workflow: %s "
                    msg += "Possible server side error."
                    self.logger.error(msg, mapRecord['TapeRuleID'], wflow['RequestName'])
                    continue
                if rucioRule['state'] == 'OK':
                    tapeRulesStatusList.append(True)
                    msg = "Tape rule: %s in final state: %s for workflow: %s"
                    self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName'])
                else:
                    tapeRulesStatusList.append(False)
                    msg = "Tape rule: %s in non final state: %s for workflow: %s"
                    self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName'])
            if all(tapeRulesStatusList):
                wflow['TransferTape'] = True

        return wflow
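    # Illustrative sketch of the transferDoc shape as consumed above (field
    # names taken from this method; the exact document layout is defined by
    # MSOutput, and the rule id is a made-up placeholder):
    #
    #     transferInfo = {'TransferStatus': 'done',
    #                     'OutputMap': [{'TapeRuleID': 'a1b2c3d4'},
    #                                   {'TapeRuleID': ''}]}
    #
    # Such a record sets 'TransferDone' to True, and only the first OutputMap
    # entry is checked against Rucio, since empty TapeRuleIDs count as completed.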

    def setParentDatasets(self, wflow):
        """
        Used to resolve parent datasets for a workflow.
        :param  wflow:   A MSRuleCleaner workflow representation
        :return:         The workflow object
        """
        if wflow['InputDataset'] and wflow['IncludeParents']:
            childDataset = wflow['InputDataset']
            parentDataset = findParent([childDataset], self.msConfig['dbsUrl'])
            # NOTE: If findParent() returned None then the DBS service failed to
            #       resolve the request (it is considered an ERROR outside WMCore)
            if parentDataset.get(childDataset, None) is None:
                msg = "Failed to resolve parent dataset for: %s in workflow: %s" % (childDataset, wflow['RequestName'])
                raise MSRuleCleanerResolveParentError(msg)
            elif parentDataset:
                wflow['ParentDataset'] = [parentDataset[childDataset]]
                msg = "Found parent %s for input dataset %s in workflow: %s "
                self.logger.info(msg, parentDataset, wflow['InputDataset'], wflow['RequestName'])
            else:
                msg = "Could not find parent for input dataset: %s in workflows: %s"
                self.logger.error(msg, wflow['InputDataset'], wflow['RequestName'])
        return wflow
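    # Illustrative sketch: findParent() is expected to return a dictionary
    # keyed by the child dataset (shape inferred from the lookups above; the
    # dataset names are made-up examples):
    #
    #     parentDataset = {'/PrimDS/AcqEra-v1/MINIAODSIM': '/PrimDS/AcqEra-v1/AODSIM'}
    #
    # A None value for the child key is treated as a DBS resolution failure
    # and raises MSRuleCleanerResolveParentError.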

    def getRucioRules(self, wflow, gran, rucioAcct):
        """
        Queries Rucio and builds the relevant list of blocklevel rules for
        the given workflow
        :param  wflow:   A MSRuleCleaner workflow representation
        :param  gran:    Data granularity to search for Rucio rules. Possible values:
                         'block' or 'container'
        :return:         The workflow object
        """
        currPline = wflow['PlineMarkers'][-1]

        # Map each Rucio account to the workflow data types it creates rules for,
        # and set the checkGlobalLocks flag.
        mapRuleType = {self.msConfig['rucioWmaAccount']: ["OutputDatasets"],
                       self.msConfig['rucioMStrAccount']: ["InputDataset", "MCPileup",
                                                           "DataPileup", "ParentDataset"]}
        if rucioAcct == self.msConfig['rucioMStrAccount']:
            checkGlobalLocks = True
        else:
            checkGlobalLocks = False

        # Find all the data placement rules created by the components:
        for dataType in mapRuleType[rucioAcct]:
            dataList = wflow[dataType] if isinstance(wflow[dataType], list) else [wflow[dataType]]
            for dataCont in dataList:
                if dataCont is None:
                    continue
                self.logger.debug("getRucioRules: dataCont: %s", pformat(dataCont))
                if checkGlobalLocks and dataCont in self.globalLocks:
                    msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the "
                    msg += "RulesToClean list for both container and block level Rules for workflow: %s!"
                    self.logger.info(msg, dataCont, wflow['RequestName'])
                    continue
                if gran == 'container':
                    for rule in self.rucio.listDataRules(dataCont, account=rucioAcct):
                        wflow['RulesToClean'][currPline].append(rule['id'])
                        msg = "Found %s container-level rule to be deleted for container %s"
                        self.logger.info(msg, rule['id'], dataCont)
                elif gran == 'block':
                    try:
                        blocks = self.rucio.getBlocksInContainer(dataCont)
                        for block in blocks:
                            for rule in self.rucio.listDataRules(block, account=rucioAcct):
                                wflow['RulesToClean'][currPline].append(rule['id'])
                                msg = "Found %s block-level rule to be deleted for container %s"
                                self.logger.info(msg, rule['id'], dataCont)
                    except WMRucioDIDNotFoundException:
                        msg = "Container: %s not found in Rucio for workflow: %s."
                        self.logger.info(msg, dataCont, wflow['RequestName'])
        return wflow
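    # Illustrative usage sketch (account names come from the deployment
    # configuration): collect the block-level rules created by the WMAgent
    # Rucio account for the workflow's output datasets:
    #
    #     wflow = self.getRucioRules(wflow, 'block', self.msConfig['rucioWmaAccount'])
    #     ruleIds = wflow['RulesToClean'][wflow['PlineMarkers'][-1]]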

    def cleanRucioRules(self, wflow):
        """
        Cleans all the rules present in the field 'RulesToClean' of the
        MSRuleCleaner workflow representation and fills the relevant cleanup status.
        :param wflow:   A MSRuleCleaner workflow representation
        :return:        The workflow object
        """
        # NOTE: The function should be called independently and sequentially from
        #       the Input and the respective BlockLevel pipelines.

        # NOTE: The current functional pipeline is always the last one in the PlineMarkers list
        currPline = wflow['PlineMarkers'][-1]
        delResults = []
        if self.msConfig['enableRealMode']:
            for rule in wflow['RulesToClean'][currPline]:
                self.logger.info("%s: Deleting ruleId: %s ", currPline, rule)
                delResult = self.rucio.deleteRule(rule)
                delResults.append(delResult)
                if not delResult:
                    self.logger.warning("%s: Failed to delete ruleId: %s ", currPline, rule)
        else:
            for rule in wflow['RulesToClean'][currPline]:
                delResults.append(True)
                self.logger.info("%s: DRY-RUN: Is about to delete ruleId: %s ", currPline, rule)

        # Set the cleanup flag:
        wflow['CleanupStatus'][currPline] = all(delResults)
        return wflow
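    # NOTE: all() over an empty list evaluates to True, so a pipeline with no
    # rules to clean is immediately marked as cleaned:
    #
    #     all([])                   # True  -> CleanupStatus[currPline] = True
    #     all([True, False, True])  # False -> at least one deletion failed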

    def getRequestRecords(self, reqStatus):
        """
        Queries ReqMgr2 for requests in a given status.
        :param reqStatus: The status for the requests to be fetched from ReqMgr2
        :return requests: A dictionary with all the workflows in the given status
        """
        self.logger.info("Fetching requests in status: %s", reqStatus)
        result = self.reqmgr2.getRequestByStatus([reqStatus], detail=True)
        requests = result[0] if result else {}
        self.logger.info('  retrieved %s requests in status: %s', len(requests), reqStatus)
        return requests
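    # Illustrative usage sketch ('announced' is just one example of a ReqMgr2
    # status; the statuses actually polled are set in the service configuration):
    #
    #     requests = self.getRequestRecords('announced')
    #     for reqName, reqDict in requests.items():
    #         ...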
Example no. 7
0
    def msOutputProducer(self, requestRecords):
        """
        A top level function to drive the upload of all the documents to MongoDB
        """

        # DONE:
        #    To implement this as a functional pipeline in the following sequence:
        #    1) document streamer - to generate all the records coming from Reqmgr2
        #    2) document stripper - to cut all the keys we do not need
        #       Mongodb document creator - to pass it through the MongoDBTemplate
        #    3) document updater - fetch & update all the needed info like campaign config etc.
        #    4) MongoDB upload/update - to upload/update the document in Mongodb

        # DONE:
        #    to have the requestRecords generated through a call to docStreamer
        #    and the call should happen from inside this function so that all
        #    the objects generated do not leave the scope of this function,
        #    thereby reducing the memory footprint

        # DONE:
        #    to set a destructive function at the end of the pipeline
        # NOTE:
        #    To discuss the collection names
        # NOTE:
        #    Here we should never use docUploader with `update=True`, because
        #    this will erase the latest state of already existing and fully or
        #    partially processed documents by the Consumer pipeline
        self.logger.info("Running the msOutputProducer ...")
        msPipelineRelVal = Pipeline(name="MSOutputProducer PipelineRelVal",
                                    funcLine=[
                                        Functor(self.docTransformer),
                                        Functor(self.docKeyUpdate,
                                                isRelVal=True),
                                        Functor(self.docInfoUpdate,
                                                pipeLine='PipelineRelVal'),
                                        Functor(self.docUploader,
                                                self.msOutRelValColl),
                                        Functor(self.docCleaner)
                                    ])
        msPipelineNonRelVal = Pipeline(
            name="MSOutputProducer PipelineNonRelVal",
            funcLine=[
                Functor(self.docTransformer),
                Functor(self.docKeyUpdate, isRelVal=False),
                Functor(self.docInfoUpdate, pipeLine='PipelineNonRelVal'),
                Functor(self.docUploader, self.msOutNonRelValColl),
                Functor(self.docCleaner)
            ])
        # TODO:
        #    To generate the object from within the Function scope see above.
        counter = 0
        for _, request in requestRecords:
            counter += 1
            try:
                if request.get('SubRequestType') == 'RelVal':
                    pipeLine = msPipelineRelVal
                else:
                    pipeLine = msPipelineNonRelVal
                pipeLineName = pipeLine.getPipelineName()
                pipeLine.run(request)
            except (KeyError, TypeError) as ex:
                msg = "%s Possibly broken read from Reqmgr2 API or other. Err: %s. " % (
                    pipeLineName, str(ex))
                msg += "Continue to the next document."
                self.logger.exception(msg)
                continue
            except Exception as ex:
                msg = "%s General Error from pipeline. Err: %s. " % (
                    pipeLineName, str(ex))
                msg += "Giving up Now."
                self.logger.error(msg)
                self.logger.exception(ex)
                break
        return counter
Example no. 8
0
    def msOutputConsumer(self):
        """
        A top level function to drive the creation and bookkeeping of all the
        subscriptions to the Data Management System
        """
        # DONE:
        #    Done: To check if the 'enableDataPlacement' flag is really taken into account
        #    Done: To make this for both relvals and non relvals
        #    Done: To return the result
        #    Done: To make report document
        #    Done: To build it through a pipe
        #    Done: To write back the updated document to MongoDB
        msPipelineRelVal = Pipeline(name="MSOutputConsumer PipelineRelVal",
                                    funcLine=[
                                        Functor(self.docReadfromMongo,
                                                self.msOutRelValColl,
                                                setTaken=False),
                                        Functor(self.makeSubscriptions),
                                        Functor(self.docKeyUpdate,
                                                isTaken=False,
                                                isTakenBy=None,
                                                lastUpdate=int(time())),
                                        Functor(self.docUploader,
                                                self.msOutRelValColl,
                                                update=True,
                                                keys=[
                                                    'isTaken', 'lastUpdate',
                                                    'transferStatus',
                                                    'transferIDs'
                                                ]),
                                        Functor(self.docDump,
                                                pipeLine='PipelineRelVal'),
                                        Functor(self.docCleaner)
                                    ])
        msPipelineNonRelVal = Pipeline(
            name="MSOutputConsumer PipelineNonRelVal",
            funcLine=[
                Functor(self.docReadfromMongo,
                        self.msOutNonRelValColl,
                        setTaken=False),
                Functor(self.makeSubscriptions),
                Functor(self.docKeyUpdate,
                        isTaken=False,
                        isTakenBy=None,
                        lastUpdate=int(time())),
                Functor(self.docUploader,
                        self.msOutNonRelValColl,
                        update=True,
                        keys=[
                            'isTaken', 'lastUpdate', 'transferStatus',
                            'transferIDs'
                        ]),
                Functor(self.docDump, pipeLine='PipelineNonRelVal'),
                Functor(self.docCleaner)
            ])

        # NOTE:
        #    If any exception actually reaches the top level exception
        #    handlers (e.g. here - outside the pipeLine), it means some
        #    function from within the pipeLine has not caught it and the
        #    msOutDoc has left the pipe and died before the relevant document
        #    in MongoDB was released (its 'isTaken' flag set back to False)
        wfCounters = {}
        for pipeLine in [msPipelineRelVal, msPipelineNonRelVal]:
            pipeLineName = pipeLine.getPipelineName()
            wfCounters[pipeLineName] = 0
            while wfCounters[pipeLineName] < self.msConfig['limitRequestsPerCycle']:
                # take only workflows:
                # - which are not already taken,
                # - for which a transfer subscription has never been done, and
                # - avoid retrying workflows in the same cycle
                # NOTE:
                #    Once we are running the service not in a dry run mode we may
                #    consider adding an $or condition in mQueryDict for transferStatus:
                #    '$or': [{'transferStatus': None},
                #            {'transferStatus': 'incomplete'}]
                #    So that we can also collect workflows with partially or fully
                #    unsuccessful transfers
                currTime = int(time())
                threshTime = currTime - self.msConfig['interval']
                mQueryDict = {
                    '$and': [
                        {'isTaken': False},
                        {'$or': [{'transferStatus': None},
                                 {'transferStatus': 'incomplete'}]},
                        {'$or': [{'lastUpdate': None},
                                 {'lastUpdate': {'$lt': threshTime}}]}]}

                # FIXME:
                #    To redefine those exceptions as MSoutputExceptions and
                #    start using those here so we do not mix with general errors
                try:
                    pipeLine.run(mQueryDict)
                except (KeyError, TypeError) as ex:
                    msg = "%s Possibly malformed record in MongoDB. Err: %s. " % (
                        pipeLineName, str(ex))
                    msg += "Continue to the next document."
                    self.logger.exception(msg)
                    continue
                except EmptyResultError as ex:
                    msg = "%s All relevant records in MongoDB exhausted. " % pipeLineName
                    msg += "We are done for the current cycle."
                    self.logger.info(msg)
                    break
                except Exception as ex:
                    msg = "%s General Error from pipeline. Err: %s. " % (
                        pipeLineName, str(ex))
                    msg += "Giving up Now."
                    self.logger.error(msg)
                    self.logger.exception(ex)
                    break
                wfCounters[pipeLineName] += 1

        wfCounterTotal = sum(wfCounters.values())
        return wfCounterTotal
Example no. 9
0
    def msOutputConsumer(self):
        """
        A top level function to drive the creation and bookkeeping of all the
        subscriptions to the Data Management System
        """
        # DONE:
        #    Done: To check if the 'enableDataPlacement' flag is really taken into account
        #    Done: To make this for both relvals and non relvals
        #    Done: To return the result
        #    Done: To make report document
        #    Done: To build it through a pipe
        #    Done: To write back the updated document to MongoDB
        msPipelineRelVal = Pipeline(
            name="MSOutputConsumer PipelineRelVal",
            funcLine=[
                Functor(self.makeSubscriptions),
                Functor(self.makeTapeSubscriptions),
                Functor(self.docUploader,
                        update=True,
                        keys=['LastUpdate', 'TransferStatus', 'OutputMap']),
                Functor(self.docDump, pipeLine='PipelineRelVal'),
                Functor(self.docCleaner)
            ])
        msPipelineNonRelVal = Pipeline(
            name="MSOutputConsumer PipelineNonRelVal",
            funcLine=[
                Functor(self.makeSubscriptions),
                Functor(self.makeTapeSubscriptions),
                Functor(self.docUploader,
                        update=True,
                        keys=['LastUpdate', 'TransferStatus', 'OutputMap']),
                Functor(self.docDump, pipeLine='PipelineNonRelVal'),
                Functor(self.docCleaner)
            ])

        wfCounterTotal = 0
        mQueryDict = {'TransferStatus': 'pending'}
        pipeCollections = [(msPipelineRelVal, self.msOutRelValColl),
                           (msPipelineNonRelVal, self.msOutNonRelValColl)]
        for pipeLine, dbColl in pipeCollections:
            wfCounters = 0
            pipeLineName = pipeLine.getPipelineName()
            for docOut in self.getDocsFromMongo(
                    mQueryDict, dbColl,
                    self.msConfig['limitRequestsPerCycle']):
                # FIXME:
                #    To redefine those exceptions as MSoutputExceptions and
                #    start using those here so we do not mix with general errors
                try:
                    # If it's in MongoDB, it can get into our in-memory cache
                    self.requestNamesCached.append(docOut['RequestName'])
                    pipeLine.run(docOut)
                except (KeyError, TypeError) as ex:
                    msg = "%s Possibly malformed record in MongoDB. Err: %s. " % (
                        pipeLineName, str(ex))
                    msg += "Continue to the next document."
                    self.logger.exception(msg)
                    continue
                except EmptyResultError as ex:
                    msg = "%s All relevant records in MongoDB exhausted. " % pipeLineName
                    msg += "We are done for the current cycle."
                    self.logger.info(msg)
                    break
                except Exception as ex:
                    msg = "%s General error from pipeline. Err: %s. " % (
                        pipeLineName, str(ex))
                    msg += "Will retry again in the next cycle."
                    self.logger.exception(msg)
                    break
                wfCounters += 1
            self.logger.info("Processed %d workflows from pipeline: %s",
                             wfCounters, pipeLineName)
            wfCounterTotal += wfCounters

        return wfCounterTotal
Example no. 10
0
class MSUnmerged(MSCore):
    """
    MSUnmerged class provides the logic for cleaning the unmerged area of
    the CMS LFN Namespace.
    """
    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSUnmerged module
        :param msConfig: micro service configuration
        """
        super(MSUnmerged, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        # self.msConfig.setdefault('limitRSEsPerInstance', 100)
        # self.msConfig.setdefault('limitTiersPerInstance', ['T1', 'T2', 'T3'])
        # self.msConfig.setdefault("rucioAccount", "FIXME_RUCIO_ACCT")
        self.msConfig.setdefault("rseExpr", "*")
        # TODO: Add 'alertManagerUrl' to msConfig'
        # self.alertServiceName = "ms-unmerged"
        # self.alertManagerAPI = AlertManagerAPI(self.msConfig.get("alertManagerUrl", None), logger=logger)

        # Building all the Pipelines:
        pName = 'plineUnmerged'
        self.plineUnmerged = Pipeline(name=pName,
                                      funcLine=[Functor(self.cleanFiles)])
        # Initialization of the deleted files counters:
        self.rseCounters = {}
        self.plineCounters = {}

    def execute(self):
        """
        Executes the whole MSUnmerged logic
        :return: summary
        """
        # start threads in MSManager which should call this method
        summary = dict(UNMERGED_REPORT)

        self.resetCounters()

        # Initialize here so a failure while fetching the RSE list cannot
        # trigger a NameError in the processing step below:
        rseList = []
        try:
            rseList = self.getRSEList()
            self.updateReportDict(summary, "total_num_rses", len(rseList))
            msg = "  retrieved %s RSEs. " % len(rseList)
            # NOTE: 'limitRSEsPerInstance' may not be set in this configuration,
            #       hence the use of .get() instead of a direct key lookup:
            msg += "Service set to process up to %s RSEs per instance." % self.msConfig.get(
                "limitRSEsPerInstance")
            self.logger.info(msg)
        except Exception as err:  # general error
            msg = "Unknown exception while trying to estimate the final RSEs to work on. Error: %s" % str(err)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)

        try:
            totalNumRses, totalNumFiles, numRsesCleaned, numFilesDeleted = self._execute(
                rseList)
            msg = "\nTotal number of processed RSEs: %s."
            msg += "\nTotal number of files to be deleted: %s."
            msg += "\nNumber of RSEs cleaned: %s."
            msg += "\nNumber of files deleted: %s."
            self.logger.info(msg, totalNumRses, totalNumFiles, numRsesCleaned,
                             numFilesDeleted)
            self.updateReportDict(summary, "total_num_rses", totalNumRses)
            self.updateReportDict(summary, "total_num_files", totalNumFiles)
            self.updateReportDict(summary, "num_rses_cleaned", numRsesCleaned)
            self.updateReportDict(summary, "num_files_deleted",
                                  numFilesDeleted)
        except Exception as ex:
            msg = "Unknown exception while running MSUnmerged thread Error: %s"
            self.logger.exception(msg, str(ex))
            self.updateReportDict(summary, "error", msg)

        return summary

    def _execute(self, rseList):
        """
        Executes the MSUnmerged pipelines
        :param rseList: A list of RSEs to work on
        :return:        a tuple with:
                            total number of RSEs
                            total number of files found for deletion
                            number of RSEs cleaned
                            number of files deleted
        """
        totalNumRses = 0
        totalNumFiles = 0
        numRsesCleaned = 0
        numFilesDeleted = 0

        # Call the workflow dispatcher:
        for rse in rseList:
            try:
                rse = MSUnmergedRSE(rse)
                self.plineUnmerged.run(rse)
                msg = "\n----------------------------------------------------------"
                msg += "\nMSUnmergedRSE: %s"
                msg += "\n----------------------------------------------------------"
                self.logger.debug(msg, pformat(rse))
                totalNumRses += 1
                if rse['isClean']:
                    numRsesCleaned += 1
                totalNumFiles += rse['counters']['totalNumFiles']
                numFilesDeleted += rse['counters']['numFilesDeleted']
            except Exception as ex:
                msg = "%s: General error from pipeline. RSE: %s. Error:  \n%s. "
                msg += "\nWill retry again in the next cycle."
                self.logger.exception(msg, self.plineUnmerged.name, rse['rse'],
                                      str(ex))
                continue
        return totalNumRses, totalNumFiles, numRsesCleaned, numFilesDeleted

    def cleanFiles(self, rse):
        """
        The method to implement the actual deletion of files for an RSE.
        :param rse: MSUnmergedRSE object to be cleaned
        :return:    The MSUnmergedRSE object
        """
        try:
            # for fileUnmerged in rse['files']['toDelete']:
            #     try:
            #         self.gfalCommand(rse['delInterface'], fileUnmerged)
            #         rse['counters']['numFilesDeleted'] += 1
            #         rse['files']['deletedSuccess'].append(fileUnmerged)
            #     except Exception as ex:
            #         rse['files']['deletedFail'].append(fileUnmerged)
            #         msg = "Error while trying to delete file: %s for RSE: %s"
            #         msg += "Will retry in the next cycle. Err: %s"
            #         self.logger.debug(msg, fileUnmerged, rse['name'], str(ex))
            rse['isClean'] = self._checkClean(rse)
        except Exception as ex:
            msg = "Error while cleaning RSE: %s"
            msg += "Will retry in the next cycle. Err: %s"
            self.logger.debug(msg, rse['name'], str(ex))

        return rse

    def _checkClean(self, rse):
        """
        A simple function to check if every file in an RSE's unmerged area has
        been deleted
        :param rse: The RSE to be checked
        :return:    Bool: True if all files found have been deleted, False otherwise
        """
        return rse['counters']['totalNumFiles'] == rse['counters']['numFilesDeleted']

    def resetCounters(self):
        """
        A simple function for zeroing the deleted files counters.
        """

        for rse in self.rseCounters:
            self.rseCounters[rse]['deletedSuccess'] = 0
            self.rseCounters[rse]['deletedFail'] = 0

        for pline in self.plineCounters:
            self.plineCounters[pline.name]['deletedSuccess'] = 0
            self.plineCounters[pline.name]['deletedFail'] = 0

    def getRSEList(self):
        """
        Queries Rucio for the proper RSE list to iterate through.
        :return: List of RSE names.
        """
        # Initialize here so a failed Rucio call cannot trigger a NameError
        # on the return below:
        rseList = []
        try:
            rseList = self.rucio.evaluateRSEExpression(self.msConfig['rseExpr'])
        except Exception as ex:
            msg = "Unknown exception while trying to fetch the initial list of RSEs to work on. Err: %s"
            self.logger.exception(msg, str(ex))
        return rseList
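    # Illustrative note: with the default rseExpr of '*', every RSE known to
    # Rucio is returned; a narrower Rucio RSE expression (e.g. 'tier=2', an
    # assumed example) would restrict the cleanup to the matching sites.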