Exemple #1
0
class UserJobFinalization(ModuleBase):
    """ Finalization of user jobs
  """

    #############################################################################
    def __init__(self, bkClient=None, dm=None):
        """Module initialization.
    """

        self.log = gLogger.getSubLogger("UserJobFinalization")
        super(UserJobFinalization, self).__init__(self.log,
                                                  bkClientIn=bkClient,
                                                  dm=dm)

        self.version = __RCSID__
        self.enable = True
        self.defaultOutputSE = resolveSEGroup(
            gConfig.getValue('/Resources/StorageElementGroups/Tier1-USER', []))
        self.failoverSEs = resolveSEGroup(
            gConfig.getValue('/Resources/StorageElementGroups/Tier1-Failover',
                             []))
        # List all parameters here
        self.request = None
        # Always allow any files specified by users
        self.outputDataFileMask = ''
        self.userOutputData = []
        self.userOutputSE = ''
        self.userOutputPath = ''
        self.failoverTransfer = None
        self.replicateUserOutputData = False
        self.userPrependString = ''

    #############################################################################
    def _resolveInputVariables(self):
        """ By convention the module parameters are resolved here.
    """
        super(UserJobFinalization, self)._resolveInputVariables()

        # Use LHCb utility for local running via dirac-jobexec
        if 'UserOutputData' in self.workflow_commons:
            userOutputData = self.workflow_commons['UserOutputData']
            if not isinstance(userOutputData, list):
                userOutputData = [i.strip() for i in userOutputData.split(';')]
            self.userOutputData = userOutputData

        if 'UserOutputSE' in self.workflow_commons:
            specifiedSE = self.workflow_commons['UserOutputSE']
            if not isinstance(specifiedSE, list):
                self.userOutputSE = [i.strip() for i in specifiedSE.split(';')]
        else:
            self.log.verbose(
                'No UserOutputSE specified, using default value: %s' %
                (', '.join(self.defaultOutputSE)))
            self.userOutputSE = []

        if 'UserOutputPath' in self.workflow_commons:
            self.userOutputPath = self.workflow_commons['UserOutputPath']

        if 'ReplicateUserOutputData' in self.workflow_commons and self.workflow_commons[
                'ReplicateUserOutputData']:
            self.replicateUserOutputData = True

        if 'UserOutputLFNPrepend' in self.workflow_commons:
            self.userPrependString = self.workflow_commons[
                'UserOutputLFNPrepend']

    #############################################################################

    def execute(self,
                production_id=None,
                prod_job_id=None,
                wms_job_id=None,
                workflowStatus=None,
                stepStatus=None,
                wf_commons=None,
                step_commons=None,
                step_number=None,
                step_id=None,
                orderedSEs=None):
        """ Main execution function.
    """

        try:

            super(UserJobFinalization,
                  self).execute(self.version, production_id, prod_job_id,
                                wms_job_id, workflowStatus, stepStatus,
                                wf_commons, step_commons, step_number, step_id)

            self._resolveInputVariables()

            # Earlier modules may have populated the report objects
            self.request.RequestName = 'job_%d_request.xml' % self.jobID
            self.request.JobID = self.jobID
            self.request.SourceComponent = "Job_%d" % self.jobID

            if not self._checkWFAndStepStatus():
                return S_OK()

            if not self.userOutputData:
                self.log.info(
                    "No user output data is specified for this job, nothing to do"
                )
                return S_OK("No output data to upload")

            self.log.info("User specified output file list is: %s" %
                          (', '.join(self.userOutputData)))

            globList = []
            for i in self.userOutputData:
                if re.search('\*', i):
                    globList.append(i)

            # Check whether list of userOutputData is a globbable pattern
            if globList:
                for i in globList:
                    self.userOutputData.remove(i)

                globbedOutputList = list(set(getGlobbedFiles(globList)))
                if globbedOutputList:
                    self.log.info(
                        'Found a pattern in the output data file list, \
          extra files to upload are: %s' % (', '.join(globbedOutputList)))
                    self.userOutputData += globbedOutputList
                else:
                    self.log.info(
                        "No files were found on the local disk for the following patterns: %s"
                        % (', '.join(globList)))

            self.log.info("Final list of files to upload are: %s" %
                          (', '.join(self.userOutputData)))

            # Determine the final list of possible output files for the workflow and all the parameters needed to upload them.
            outputList = []
            for i in self.userOutputData:
                outputList.append({
                    'outputDataType': ('.'.split(i)[-1]).upper(),
                    'outputDataName': os.path.basename(i)
                })

            userOutputLFNs = []
            if self.userOutputData:
                self.log.info("Constructing user output LFN(s) for %s" %
                              (', '.join(self.userOutputData)))

                userOutputLFNs = constructUserLFNs(self.jobID,
                                                   self._getCurrentOwner(),
                                                   self.userOutputData,
                                                   self.userOutputPath,
                                                   self.userPrependString)

            self.log.verbose(
                "Calling getCandidateFiles( %s, %s, %s)" %
                (outputList, userOutputLFNs, self.outputDataFileMask))
            try:
                fileDict = self.getCandidateFiles(outputList, userOutputLFNs,
                                                  self.outputDataFileMask)
            except os.error as e:
                self.setApplicationStatus(e)
                return S_OK()

            try:
                fileMetadata = self.getFileMetadata(fileDict)
            except RuntimeError as e:
                self.setApplicationStatus(e)
                return S_OK()

            if not fileMetadata:
                self.log.info(
                    "No output data files were determined to be uploaded for this workflow"
                )
                self.setApplicationStatus('No Output Data Files To Upload')
                return S_OK()

            if not orderedSEs:
                orderedSEs = self._getOrderedSEsList()

            self.log.info("Ordered list of output SEs is: %s" %
                          (', '.join(orderedSEs)))
            final = {}
            for fileName, metadata in fileMetadata.items():
                final[fileName] = metadata
                final[fileName]['resolvedSE'] = orderedSEs

            # At this point can exit and see exactly what the module will upload
            if not self._enableModule():
                self.log.info(
                    "Module disabled would have attempted to upload the files %s"
                    % ', '.join(final.keys()))
                for fileName, metadata in final.items():
                    self.log.info('--------%s--------' % fileName)
                    for n, v in metadata.items():
                        self.log.info('%s = %s' % (n, v))

                return S_OK("Module is disabled by control flag")

            # Disable the watchdog check in case the file uploading takes a long time
            self._disableWatchdogCPUCheck()

            # Instantiate the failover transfer client with the global request object
            if not self.failoverTransfer:
                self.failoverTransfer = FailoverTransfer(self.request)

            # One by one upload the files with failover if necessary
            replication = {}
            failover = {}
            uploaded = []
            for fileName, metadata in final.items():
                self.log.info(
                    "Attempting to store %s to the following SE(s): %s" %
                    (fileName, ', '.join(metadata['resolvedSE'])))
                fileMetaDict = {
                    'Size': metadata['filedict']['Size'],
                    'LFN': metadata['filedict']['LFN'],
                    'GUID': metadata['filedict']['GUID'],
                    'Checksum': metadata['filedict']['Checksum'],
                    'ChecksumType': metadata['filedict']['ChecksumType']
                }
                result = self.failoverTransfer.transferAndRegisterFile(
                    fileName=fileName,
                    localPath=metadata['localpath'],
                    lfn=metadata['filedict']['LFN'],
                    destinationSEList=metadata['resolvedSE'],
                    fileMetaDict=fileMetaDict,
                    masterCatalogOnly=True)
                if not result['OK']:
                    self.log.error(
                        "Could not transfer and register %s with metadata:\n %s"
                        % (fileName, metadata))
                    failover[fileName] = metadata
                else:
                    # Only attempt replication after successful upload
                    lfn = metadata['lfn']
                    uploaded.append(lfn)
                    seList = metadata['resolvedSE']
                    replicateSE = ''
                    uploadedSE = result['Value'].get('uploadedSE', '')
                    if uploadedSE:
                        for se in seList:
                            if not se == uploadedSE:
                                replicateSE = se
                                break

                    if replicateSE and lfn and self.replicateUserOutputData:
                        self.log.info("Will attempt to replicate %s to %s" %
                                      (lfn, replicateSE))
                        replication[lfn] = (uploadedSE, replicateSE,
                                            fileMetaDict)

            cleanUp = False
            for fileName, metadata in failover.items():
                random.shuffle(self.failoverSEs)
                targetSE = metadata['resolvedSE'][0]
                if len(metadata['resolvedSE']) > 1:
                    replicateSE = metadata['resolvedSE'][1]
                else:
                    replicateSE = ''
                metadata['resolvedSE'] = self.failoverSEs
                fileMetaDict = {
                    'Size': metadata['filedict']['Size'],
                    'LFN': metadata['filedict']['LFN'],
                    'GUID': metadata['filedict']['GUID']
                }
                result = self.failoverTransfer.transferAndRegisterFileFailover(
                    fileName,
                    metadata['localpath'],
                    metadata['lfn'],
                    targetSE,
                    metadata['resolvedSE'],
                    fileMetaDict=fileMetaDict,
                    masterCatalogOnly=True)
                if not result['OK']:
                    self.log.error(
                        "Could not transfer and register %s with metadata:\n %s"
                        % (fileName, metadata))
                    cleanUp = True
                    continue  # for users can continue even if one completely fails
                else:
                    lfn = metadata['lfn']
                    uploaded.append(lfn)
                    # Even when using Failover, one needs to replicate to a second SE
                    if replicateSE and self.replicateUserOutputData:
                        replication[lfn] = (targetSE, replicateSE,
                                            fileMetaDict)

            # For files correctly uploaded must report LFNs to job parameters
            if uploaded:
                report = ', '.join(uploaded)
                self.setJobParameter('UploadedOutputData', report)

            # Now after all operations, retrieve potentially modified request object
            self.request = self.failoverTransfer.request

            # If some or all of the files failed to be saved to failover
            if cleanUp:
                self.workflow_commons['Request'] = self.request
                # Leave any uploaded files just in case it is useful for the user
                # do not try to replicate any files.
                return S_ERROR("Failed To Upload Output Data")

            for lfn, (uploadedSE, repSE,
                      fileMetaDictItem) in replication.items():
                self.failoverTransfer._setFileReplicationRequest(
                    lfn, repSE, fileMetaDictItem, uploadedSE)

            self.workflow_commons['Request'] = self.failoverTransfer.request

            self.generateFailoverFile()

            self.setApplicationStatus("Job Finished Successfully")

            return S_OK('Output data uploaded')

        except Exception as e:  # pylint:disable=broad-except
            self.log.exception("Failure in UserJobFinalization execute module",
                               lException=e)
            self.setApplicationStatus(repr(e))
            return S_ERROR(str(e))

        finally:
            super(UserJobFinalization, self).finalize(self.version)

    #############################################################################

    def _getOrderedSEsList(self):
        """ Returns list of ordered SEs to which trying to upload
    """
        # FIXME: remove all banned SEs (not the force ones)
        # First get the local (or assigned) SE to try first for upload and others in random fashion
        localSEs = set(
            getDestinationSEList('Tier1-USER',
                                 self.siteName,
                                 outputmode='local'))
        self.log.verbose("Site Local SE for user outputs is: %s" %
                         (list(localSEs)))
        userSEs = set(self.userOutputSE)
        otherSEs = set(self.defaultOutputSE) - localSEs - userSEs
        # If a user SE is  local set it first
        topSEs = userSEs & localSEs
        # reordered user SEs, setting local first
        userSEs = list(topSEs) + list(userSEs - topSEs)
        localSEs = list(localSEs - topSEs)
        if len(userSEs) < 2 and localSEs:
            # Set a local SE first
            orderedSEs = localSEs[0:1] + userSEs + localSEs[1:]
        else:
            orderedSEs = userSEs + localSEs
        random.shuffle(list(otherSEs))
        orderedSEs += otherSEs

        return orderedSEs

    def _getCurrentOwner(self):
        """Simple function to return current DIRAC username.
    """
        if 'OwnerName' in self.workflow_commons:
            return self.workflow_commons['OwnerName']

        result = getProxyInfo()

        if not result['OK']:
            if not self._enableModule():
                return 'testUser'
            raise RuntimeError('Could not obtain proxy information')

        if 'username' not in result['Value']:
            raise RuntimeError('Could not get username from proxy')

        return result['Value']['username']